blob: 352efd327f88ab9dceeaf6b096d4e6c70250c9f6 [file] [log] [blame]
// Symbol-visibility helpers.
// GCC and Clang only accept "default", "hidden", "internal" or "protected" as
// visibility values; "default" is the one that makes a symbol visible outside
// the shared object, which is what an export marker needs.
#define EXPORT __attribute__((visibility("default")))
// Import marker; expands to nothing.
#define IMPORT
Marc Kupietz12af0192021-03-13 18:05:14 +01003
Marc Kupietz5ffc4742024-11-15 15:45:12 +01004#include <cassert>
Marc Kupietz28cc53e2017-12-23 17:24:55 +01005#include <memory>
6#include <iostream>
Marc Kupietzc8ddf452018-01-07 21:33:12 +01007#include <algorithm>
8#include <vector>
Marc Kupietz5ffc4742024-11-15 15:45:12 +01009#include <cstdint>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010010#include <string>
11#include <sstream> // for ostringstream
Marc Kupietz5ffc4742024-11-15 15:45:12 +010012#include <cmath>
Marc Kupietzd31254c2018-01-20 21:29:30 +010013#include <rocksdb/cache.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010014#include "rocksdb/db.h"
15#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010016#include "rocksdb/table.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010017#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010018#include <rocksdb/slice_transform.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010019#include "merge_operators.h"
Marc Kupietz44229232024-08-05 15:00:20 +020020#include "export.h"
Marc Kupietz6208fd72024-11-15 15:46:19 +010021#include "config.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010022
// Number of positions considered on each side of the node word.
#define WINDOW_SIZE 5
// Minimum co-occurrence count used as a cut-off by the association measures.
#define FREQUENCY_THRESHOLD 5

// True when the host stores integers most-significant byte first.
// Prefer the compiler-provided byte-order macros; the fallback reads a string
// literal through a uint16_t pointer and is kept only for toolchains that do
// not define them.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
#define IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#else
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
#endif

// Packs a collocation into one 64-bit key:
//   bits  0..23  w1, bits 24..47  w2, bits 56..63  dist.
// Arguments are parenthesized so that expressions such as `a | b` are packed
// as a whole. The values are not masked: w1/w2 are expected to fit in 24 bits.
#define encodeCollocation(w1, w2, dist) (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (w1))
// Field extractors for keys built by encodeCollocation.
#define W1(key) ((uint64_t)((key) & 0xffffff))
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010030
// C-style vocabulary record: a frequency count plus a pointer to the word's
// characters (the pointer is stored as-is; this struct does not manage it).
struct vocab_entry {
  uint64_t freq;
  char *word;
};
Marc Kupietzc8ddf452018-01-07 21:33:12 +010035
36// typedef struct Collocator {
37// uint64_t w2;
38// uint64_t sum;
39// };
40
Marc Kupietz28cc53e2017-12-23 17:24:55 +010041using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010042using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010043
Marc Kupietz4b799e92018-01-02 11:04:56 +010044namespace rocksdb {
// One scored collocate of a node word.
// This is a plain aggregate: applyCAMeasures fills it with a braced
// initializer list, so the member order below must not change.
struct Collocator {
  uint32_t w2;         // id of the collocate
  uint64_t f2;         // vocabulary frequency of the collocate
  uint64_t raw;        // co-occurrence count summed over the window
  double pmi;          // pointwise mutual information
  double npmi;         // normalized PMI
  double llr;          // log-likelihood ratio
  double lfmd;         // log2(o^3 / e)
  double md;           // log2(o^2 / e)
  uint64_t left_raw;   // count taken from sumWindow[WINDOW_SIZE]
  uint64_t right_raw;  // count taken from sumWindow[WINDOW_SIZE - 1]
  double left_pmi;     // PMI computed from left_raw with window size 1
  double right_pmi;    // PMI computed from right_raw with window size 1
  double dice;         // Dice coefficient
  double logdice;      // logDice over the whole window
  double ldaf;         // best logDice found over subsets of the used positions
  int window;          // bitmask of the positions that were used
  int af_window;       // bitmask of the subset that produced ldaf
};
65
// Call counters for the merge machinery.
// num_merge_operator_calls is bumped by CountMergeOperator::Merge; no code in
// this part of the file increments num_partial_merge_calls.
size_t num_merge_operator_calls;
size_t num_partial_merge_calls;

// Sets the full-merge call counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = 0;
}

// Sets the partial-merge call counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = 0;
}
Marc Kupietz28cc53e2017-12-23 17:24:55 +010073
74
// Writes `value` into the 8 bytes at `buf`, least-significant byte first.
//
// The byte layout is the same on every host: extracting each byte with a
// shift does not depend on the machine's byte order, so no runtime
// endianness probe (and no memcpy) is needed.
//
// @param buf    destination; must have room for 8 bytes
// @param value  integer to serialize
inline void EncodeFixed64(char *buf, uint64_t value) {
  for (int i = 0; i < 8; ++i) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}
89
// Reads a 32-bit unsigned integer stored least-significant byte first.
//
// Bytes are combined with shifts, which yields the same value regardless of
// the host's byte order. Each byte goes through unsigned char first so that
// bytes >= 0x80 are not sign-extended.
//
// @param ptr  source; must point at 4 readable bytes
// @return     the decoded value
inline uint32_t DecodeFixed32(const char *ptr) {
  uint32_t result = 0;
  for (int i = 0; i < 4; ++i) {
    result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
103
// Reads a 64-bit unsigned integer stored least-significant byte first
// (the inverse of EncodeFixed64).
//
// Bytes are combined with shifts, which yields the same value regardless of
// the host's byte order. Each byte goes through unsigned char first so that
// bytes >= 0x80 are not sign-extended.
//
// @param ptr  source; must point at 8 readable bytes
// @return     the decoded value
inline uint64_t DecodeFixed64(const char *ptr) {
  uint64_t result = 0;
  for (int i = 0; i < 8; ++i) {
    result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
116
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100117 static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100118 double
Marc Kupietz12af0192021-03-13 18:05:14 +0100119 r1 = f1 * window_size,
120 c1 = f2,
121 e = r1 * c1 / total,
122 o = f12;
123 if (f12 < FREQUENCY_THRESHOLD)
Marc Kupietzf4a649a2021-02-26 09:18:01 +0100124 return -1.0;
125 else
Marc Kupietz12af0192021-03-13 18:05:14 +0100126 return log2(o / e);
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100127 }
128
Marc Kupietzce0b8b02018-06-05 11:06:39 +0200129 // Bouma, Gerlof (2009): <a href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
130 // Normalized (pointwise) mutual information in collocation extraction</a>. In Proceedings of GSCL.
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100131 static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100132 double
Marc Kupietz12af0192021-03-13 18:05:14 +0100133 r1 = f1 * window_size,
134 c1 = f2,
135 e = r1 * c1 / total,
136 o = f12;
137 if (f12 < FREQUENCY_THRESHOLD)
Marc Kupietz8caf9912018-06-05 10:51:18 +0200138 return -1.0;
139 else
Marc Kupietz12af0192021-03-13 18:05:14 +0100140 return log2(o / e) / (-log2(o / total / window_size));
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100141 }
142
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
// In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
//
// Mutual dependency: log2(o^2 / e), with e = f1 * window_size * f2 / total
// and o = f12. No frequency cut-off is applied here.
static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  const double row_total = f1 * window_size;
  const double col_total = f2;
  const double expected = row_total * col_total / total;
  const double observed = f12;
  const double observed_sq = observed * observed;
  return log2(observed_sq / expected);
}
154
// Log-frequency biased mutual dependency as computed here: log2(o^3 / e),
// with e = f1 * window_size * f2 / total and o = f12.
//
// @return 0 when f12 is zero, otherwise log2(o^3 / e)
static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  if (f12 == 0) {
    return 0;
  }
  const double row_total = f1 * window_size;
  const double col_total = f2;
  const double expected = row_total * col_total / total;
  const double observed = f12;
  const double observed_cubed = observed * observed * observed;
  return log2(observed_cubed / expected);
}
166
// Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
// Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
//
// Log-likelihood ratio over the 2x2 contingency table whose row total is
// w1 * window_size and whose column total is w2, with n as the grand total.
// Cells with a non-positive observed count contribute 0.
static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
  // Marginals.
  const double row1 = (double) w1 * window_size;
  const double row2 = (double) n - row1;
  const double col1 = w2;
  const double col2 = n - col1;

  // Observed cell counts.
  const double obs11 = w12;
  const double obs12 = row1 - obs11;
  const double obs21 = col1 - w12;
  const double obs22 = row2 - obs21;

  // Expected cell counts under independence.
  const double exp11 = row1 * col1 / n;
  const double exp12 = row1 * col2 / n;
  const double exp21 = row2 * col1 / n;
  const double exp22 = row2 * col2 / n;

  // Contribution of one cell: o * ln(o / e), or 0 for an empty cell.
  const auto cell = [](double observed, double expected) -> double {
    return observed > 0 ? observed * log(observed / expected) : 0;
  };

  return 2 * (cell(obs11, exp11) + cell(obs12, exp12) + cell(obs21, exp21) + cell(obs22, exp22));
}
Marc Kupietz4b799e92018-01-02 11:04:56 +0100182
Marc Kupietz41880452019-01-22 15:29:06 +0100183
// Dice coefficient: 2 * w12 / (w2 + w1 * window_size).
// The parameter n is accepted for signature parity with the other measures
// and is not used.
static inline double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
  const double row_total = (double) w1 * window_size;
  const double col_total = w2;
  const double denominator = col_total + row_total;
  return 2 * w12 / denominator;
}
190
// Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
//
// logDice: 14 + log2(2 * w12 / (w2 + w1 * window_size)).
// The parameter n is accepted for signature parity with the other measures
// and is not used.
static inline double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
  const double row_total = (double) w1 * window_size;
  const double col_total = w2;
  const double dice = 2 * w12 / (col_total + row_total);
  return 14 + log2(dice);
}
198
Marc Kupietz4b799e92018-01-02 11:04:56 +0100199 class CountMergeOperator : public AssociativeMergeOperator {
200 public:
Marc Kupietz12af0192021-03-13 18:05:14 +0100201 CountMergeOperator() {
202 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
203 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100204
Marc Kupietz12af0192021-03-13 18:05:14 +0100205 virtual bool Merge(const Slice &key,
206 const Slice *existing_value,
207 const Slice &value,
208 std::string *new_value,
209 Logger *logger) const override {
210 assert(new_value->empty());
211 ++num_merge_operator_calls;
212 if (existing_value == nullptr) {
213 new_value->assign(value.data(), value.size());
214 return true;
215 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100216
Marc Kupietz12af0192021-03-13 18:05:14 +0100217 return mergeOperator_->PartialMerge(
218 key,
219 *existing_value,
220 value,
221 new_value,
222 logger);
223 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100224
Marc Kupietz12af0192021-03-13 18:05:14 +0100225 virtual const char *Name() const override {
226 return "UInt64AddOperator";
227 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100228
Marc Kupietz4b799e92018-01-02 11:04:56 +0100229 private:
Marc Kupietz12af0192021-03-13 18:05:14 +0100230 std::shared_ptr<MergeOperator> mergeOperator_;
Marc Kupietz4b799e92018-01-02 11:04:56 +0100231 };
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100232
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100233
Marc Kupietz4b799e92018-01-02 11:04:56 +0100234 class CollocatorIterator : public Iterator {
235 private:
Marc Kupietz12af0192021-03-13 18:05:14 +0100236 char prefixc[sizeof(uint64_t)];
237 Iterator *base_iterator_;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100238
239
Marc Kupietz4b799e92018-01-02 11:04:56 +0100240 public:
Marc Kupietz12af0192021-03-13 18:05:14 +0100241 CollocatorIterator(Iterator *base_iterator)
242 : base_iterator_(base_iterator) {}
Marc Kupietz4b799e92018-01-02 11:04:56 +0100243
Marc Kupietz12af0192021-03-13 18:05:14 +0100244 void setPrefix(char *prefix) {
245 memcpy(prefixc, prefix, sizeof(uint64_t));
246 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100247
Marc Kupietz12af0192021-03-13 18:05:14 +0100248 virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
249
250 virtual void SeekToLast() { base_iterator_->SeekToLast(); }
251
252 virtual void Seek(const rocksdb::Slice &s) { base_iterator_->Seek(s); }
253
254 virtual void
255 SeekForPrev(const rocksdb::Slice &s) { base_iterator_->SeekForPrev(s); }
256
257 virtual void Prev() { base_iterator_->Prev(); }
258
259 virtual void Next() { base_iterator_->Next(); }
260
261 virtual Slice key() const;
262
263 virtual Slice value() const;
264
265 virtual Status status() const;
266
267 virtual bool Valid() const;
268
269 bool isValid();
270
271 uint64_t intValue();
272
273 uint64_t intKey();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100274
Marc Kupietz4b799e92018-01-02 11:04:56 +0100275 };
Marc Kupietz18375e12017-12-24 10:11:18 +0100276
Marc Kupietz4b799e92018-01-02 11:04:56 +0100277 // rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100278
Marc Kupietz4b799e92018-01-02 11:04:56 +0100279 bool rocksdb::CollocatorIterator::Valid() const {
Marc Kupietz12af0192021-03-13 18:05:14 +0100280 return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100281 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100282
Marc Kupietz4b799e92018-01-02 11:04:56 +0100283 bool rocksdb::CollocatorIterator::isValid() {
Marc Kupietz12af0192021-03-13 18:05:14 +0100284 return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
Marc Kupietzd31254c2018-01-20 21:29:30 +0100285 // return key().starts_with(std::string(prefixc,3));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100286 }
Marc Kupietz18375e12017-12-24 10:11:18 +0100287
Marc Kupietz4b799e92018-01-02 11:04:56 +0100288 uint64_t rocksdb::CollocatorIterator::intKey() {
289 return DecodeFixed64(base_iterator_->key().data());
290 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100291
Marc Kupietz4b799e92018-01-02 11:04:56 +0100292 uint64_t rocksdb::CollocatorIterator::intValue() {
293 return DecodeFixed64(base_iterator_->value().data());
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100294 }
295
// One vocabulary line: the word and its corpus frequency.
// Kept as a plain aggregate; readVocab builds entries with {word, freq}.
struct VocabEntry {
  string word;
  uint64_t freq;
};
301
Marc Kupietz6aec7682018-01-10 09:47:48 +0100302 class CollocatorDB {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100303 private:
Marc Kupietz12af0192021-03-13 18:05:14 +0100304 WriteOptions merge_option_; // for merge
305 char _one[sizeof(uint64_t)];
306 Slice _one_slice;
307 vector<VocabEntry> _vocab;
308 uint64_t total = 0;
309 uint64_t sentences = 0;
310 float avg_window_size = 8.0;
311
Marc Kupietz4b799e92018-01-02 11:04:56 +0100312 protected:
Marc Kupietz12af0192021-03-13 18:05:14 +0100313 std::shared_ptr<DB> db_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100314
Marc Kupietz12af0192021-03-13 18:05:14 +0100315 WriteOptions put_option_;
316 ReadOptions get_option_;
317 WriteOptions delete_option_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100318
Marc Kupietz12af0192021-03-13 18:05:14 +0100319 uint64_t default_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100320
Marc Kupietz12af0192021-03-13 18:05:14 +0100321 std::shared_ptr<DB> OpenDb(const char *dbname);
322
323 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
324
Marc Kupietz12af0192021-03-13 18:05:14 +0100325
Marc Kupietz4b799e92018-01-02 11:04:56 +0100326 public:
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100327 void readVocab(string fname);
Marc Kupietz12af0192021-03-13 18:05:14 +0100328 string getWord(uint32_t w1);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100329
Marc Kupietz979580e2024-11-21 18:05:07 +0100330 uint64_t getWordId(const char *word) const;
331
Marc Kupietz12af0192021-03-13 18:05:14 +0100332 CollocatorDB(const char *db_name, bool read_only);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100333
Marc Kupietz12af0192021-03-13 18:05:14 +0100334 // public interface of CollocatorDB.
335 // All four functions return false
336 // if the underlying level db operation failed.
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100337
Marc Kupietz12af0192021-03-13 18:05:14 +0100338 // mapped to a levedb Put
339 bool set(const std::string &key, uint64_t value) {
340 // just treat the internal rep of int64 as the string
341 char buf[sizeof(value)];
342 EncodeFixed64(buf, value);
343 Slice slice(buf, sizeof(value));
344 auto s = db_->Put(put_option_, key, slice);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100345
Marc Kupietz12af0192021-03-13 18:05:14 +0100346 if (s.ok()) {
347 return true;
348 } else {
349 std::cerr << s.ToString() << std::endl;
350 return false;
351 }
352 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100353
Marc Kupietz12af0192021-03-13 18:05:14 +0100354 DB *getDb() {
355 return db_.get();
356 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100357
Marc Kupietz12af0192021-03-13 18:05:14 +0100358 // mapped to a rocksdb Delete
359 bool remove(const std::string &key) {
360 auto s = db_->Delete(delete_option_, key);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100361
Marc Kupietz12af0192021-03-13 18:05:14 +0100362 if (s.ok()) {
363 return true;
364 } else {
365 std::cerr << s.ToString() << std::endl;
366 return false;
367 }
368 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100369
Marc Kupietz12af0192021-03-13 18:05:14 +0100370 // mapped to a rocksdb Get
371 bool get(const std::string &key, uint64_t *value) {
372 std::string str;
373 auto s = db_->Get(get_option_, key, &str);
374
375 if (s.IsNotFound()) {
376 // return default value if not found;
377 *value = default_;
378 return true;
379 } else if (s.ok()) {
380 // deserialization
381 if (str.size() != sizeof(uint64_t)) {
382 std::cerr << "value corruption\n";
383 return false;
384 }
385 *value = DecodeFixed64(&str[0]);
386 return true;
387 } else {
388 std::cerr << s.ToString() << std::endl;
389 return false;
390 }
391 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100392
393
Marc Kupietz12af0192021-03-13 18:05:14 +0100394 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
395 char encoded_key[sizeof(uint64_t)];
396 EncodeFixed64(encoded_key, encodeCollocation(w1, w2, dist));
397 uint64_t value = default_;
398 get(std::string(encoded_key, 8), &value);
399 return value;
400 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100401
Marc Kupietz12af0192021-03-13 18:05:14 +0100402 virtual void inc(const std::string &key) {
403 db_->Merge(merge_option_, key, _one_slice);
404 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100405
Marc Kupietz12af0192021-03-13 18:05:14 +0100406 void inc(const uint64_t key) {
407 char encoded_key[sizeof(uint64_t)];
408 EncodeFixed64(encoded_key, key);
409 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
410 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100411
Marc Kupietz12af0192021-03-13 18:05:14 +0100412 virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
Marc Kupietz8c62c372019-01-31 12:21:01 +0100413
Marc Kupietz12af0192021-03-13 18:05:14 +0100414 void dump(uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100415
Marc Kupietz12af0192021-03-13 18:05:14 +0100416 vector<Collocator> get_collocators(uint32_t w1);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100417
Marc Kupietz12af0192021-03-13 18:05:14 +0100418 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100419
Marc Kupietz12af0192021-03-13 18:05:14 +0100420 vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
421
422 vector<Collocator>
423 get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2);
424
425 void
426 applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
427 const uint64_t sum, const int usedPositions,
428 int true_window_size, rocksdb::Collocator *result);
429
430 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
431
432 string collocators2json(uint32_t w1, vector<Collocator> collocators);
433
434 // mapped to a rocksdb Merge operation
435 virtual bool add(const std::string &key, uint64_t value) {
436 char encoded[sizeof(uint64_t)];
437 EncodeFixed64(encoded, value);
438 Slice slice(encoded, sizeof(uint64_t));
439 auto s = db_->Merge(merge_option_, key, slice);
440
441 if (s.ok()) {
442 return true;
443 } else {
444 std::cerr << s.ToString() << std::endl;
445 return false;
446 }
447 }
448
449 CollocatorIterator *SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100450 };
451
Marc Kupietz6aec7682018-01-10 09:47:48 +0100452 rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100453 // merge_option_.sync = true;
454 if (read_only)
Marc Kupietz88d116b2021-03-13 18:05:14 +0100455 db_ = OpenDbForRead(strdup(db_name));
Marc Kupietz6bb27762018-01-09 17:53:01 +0100456 else
457 db_ = OpenDb(db_name);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100458 assert(db_);
459 uint64_t one = 1;
460 EncodeFixed64(_one, one);
461 _one_slice = Slice(_one, sizeof(uint64_t));
462 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100463
Marc Kupietz6aec7682018-01-10 09:47:48 +0100464 void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100465 inc(encodeCollocation(w1, w2, dist));
466 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100467
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100468 void rocksdb::CollocatorDB::readVocab(string fname) {
Marc Kupietz37359b12018-01-09 21:11:37 +0100469 char strbuf[2048];
470 uint64_t freq;
471 FILE *fin = fopen(fname.c_str(), "rb");
472 if (fin == NULL) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100473 cout << "Vocabulary file " << fname << " not found\n";
Marc Kupietz37359b12018-01-09 21:11:37 +0100474 exit(1);
475 }
476 uint64_t i = 0;
„feldmueller“2441f7c2024-11-14 16:31:30 +0100477 while (fscanf(fin, "%s %lu", strbuf, &freq) == 2) {
Marc Kupietz37359b12018-01-09 21:11:37 +0100478 _vocab.push_back({strbuf, freq});
479 total += freq;
480 i++;
481 }
482 fclose(fin);
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100483
484 char size_fname[256];
485 strcpy(size_fname, fname.c_str());
486 char *pos = strstr(size_fname, ".vocab");
Marc Kupietz12af0192021-03-13 18:05:14 +0100487 if (pos) {
488 *pos = 0;
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100489 strcat(size_fname, ".size");
490 FILE *fp = fopen(size_fname, "r");
491 if (fp != NULL) {
492 fscanf(fp, "%lu", &sentences);
493 fscanf(fp, "%lu", &total);
Marc Kupietz12af0192021-03-13 18:05:14 +0100494 float sl = (float) total / (float) sentences;
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100495 float w = WINDOW_SIZE;
Marc Kupietz12af0192021-03-13 18:05:14 +0100496 avg_window_size = ((sl > 2 * w ? (sl - 2 * w) * 2 * w : 0) + (double) w * (3 * w - 1)) / sl;
497 fprintf(stdout,
498 "Size corrections found: corpus size: %lu tokens in %lu sentences, avg. sentence size: %f, avg. window size: %f\n",
499 total, sentences, sl, avg_window_size);
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100500 fclose(fp);
501 } else {
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100502 // std::cout << "size file " << size_fname << " not found\n";
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100503 }
504 } else {
Marc Kupietz12af0192021-03-13 18:05:14 +0100505 std::cout << "cannot determine size file " << size_fname << "\n";
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100506 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100507 }
508
Marc Kupietz6aec7682018-01-10 09:47:48 +0100509 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100510 DB *db;
511 Options options;
512 options.env->SetBackgroundThreads(4);
513 options.create_if_missing = true;
514 options.merge_operator = std::make_shared<CountMergeOperator>();
515 options.max_successive_merges = 0;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100516 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
Marc Kupietz12af0192021-03-13 18:05:14 +0100517 options.IncreaseParallelism();
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100518 options.OptimizeLevelStyleCompaction();
519 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
Marc Kupietz37359b12018-01-09 21:11:37 +0100520 ostringstream dbname, vocabname;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100521 dbname << name << ".rocksdb";
Marc Kupietz12af0192021-03-13 18:05:14 +0100522 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
523 if (!s.ok()) {
524 std::cerr << s.ToString() << std::endl;
525 assert(false);
526 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100527 vocabname << name << ".vocab";
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100528 readVocab(vocabname.str());
Marc Kupietz12af0192021-03-13 18:05:14 +0100529 return std::shared_ptr<DB>(db);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100530 }
531
Marc Kupietz6aec7682018-01-10 09:47:48 +0100532 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100533 DB *db;
534 Options options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100535
536
Marc Kupietz12af0192021-03-13 18:05:14 +0100537 options.env->SetBackgroundThreads(4);
538 options.create_if_missing = true;
539 options.merge_operator = std::make_shared<CountMergeOperator>();
540 options.max_successive_merges = 0;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100541 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
Marc Kupietz12af0192021-03-13 18:05:14 +0100542 options.IncreaseParallelism();
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100543 options.OptimizeLevelStyleCompaction();
544 // options.max_write_buffer_number = 48;
545 // options.max_background_jobs = 48;
546 // options.allow_concurrent_memtable_write=true;
Marc Kupietz12af0192021-03-13 18:05:14 +0100547 // options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
548 // options.enable_write_thread_adaptive_yield = 1;
549 // options.allow_concurrent_memtable_write = 1;
550 // options.memtable_factory.reset(new rocksdb::SkipListFactory);
551 // options.write_buffer_size = 1 << 22;
552 // options.allow_mmap_reads = true;
553 // options.allow_mmap_writes = true;
554 // options.max_background_compactions = 40;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100555 // BlockBasedTableOptions table_options;
556 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
Marc Kupietz12af0192021-03-13 18:05:14 +0100557 // options.bloom_locality = 1;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100558 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
559 // table_options.block_cache = cache;
Marc Kupietz12af0192021-03-13 18:05:14 +0100560 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
561 Status s;
562 // DestroyDB(dbname, Options());
563 s = DB::Open(options, dbname, &db);
564 if (!s.ok()) {
565 std::cerr << s.ToString() << std::endl;
566 assert(false);
567 }
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100568 total = 1000;
Marc Kupietz12af0192021-03-13 18:05:14 +0100569 return std::shared_ptr<DB>(db);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100570 }
571
Marc Kupietz12af0192021-03-13 18:05:14 +0100572 CollocatorIterator *rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
Marc Kupietz18375e12017-12-24 10:11:18 +0100573 ReadOptions options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100574 options.prefix_same_as_start = true;
Marc Kupietz18375e12017-12-24 10:11:18 +0100575 char prefixc[sizeof(uint64_t)];
576 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
577 Iterator *it = db_->NewIterator(options);
578 CollocatorIterator *cit = new CollocatorIterator(it);
Marc Kupietz88d116b2021-03-13 18:05:14 +0100579 if (w2 > 0)
580 cit->Seek(std::string(prefixc, 6));
581 else
582 cit->Seek(std::string(prefixc, 3));
Marc Kupietz18375e12017-12-24 10:11:18 +0100583 cit->setPrefix(prefixc);
584 return cit;
585 }
586
Marc Kupietz12af0192021-03-13 18:05:14 +0100587 void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100588 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
589 for (; it->isValid(); it->Next()) {
590 uint64_t value = it->intValue();
591 uint64_t key = it->intKey();
Marc Kupietz12af0192021-03-13 18:05:14 +0100592 std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value
593 << std::endl;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100594 }
595 std::cout << "ready dumping\n";
596 }
597
Marc Kupietz12af0192021-03-13 18:05:14 +0100598 bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
599
600 bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
601
602 bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
603
604 bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) { return lhs.logdice > rhs.logdice; }
605
Marc Kupietz3203e4c2019-02-04 12:42:45 +0100606 bool sortByLogDiceAF(const Collocator &lhs, const Collocator &rhs) { return lhs.ldaf > rhs.ldaf; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100607
Marc Kupietz8c62c372019-01-31 12:21:01 +0100608
Marc Kupietz12af0192021-03-13 18:05:14 +0100609 void rocksdb::CollocatorDB::applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
610 const uint64_t sum, const int usedPositions, int true_window_size,
611 rocksdb::Collocator *result) {
Marc Kupietz8c62c372019-01-31 12:21:01 +0100612 uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
613 double o = sum,
Marc Kupietz12af0192021-03-13 18:05:14 +0100614 r1 = f1 * true_window_size,
615 c1 = f2,
616 e = r1 * c1 / total,
617 pmi = log2(o / e),
618 md = log2(o * o / e),
619 lfmd = log2(o * o * o / e),
620 llr = ca_ll(f1, f2, sum, total, true_window_size);
621 double ld = ca_logdice(f1, f2, sum, total, true_window_size);
Marc Kupietz8c62c372019-01-31 12:21:01 +0100622
623 int bestWindow = usedPositions;
624 double bestAF = ld;
625 double currentAF;
626 // if(f1<75000000)
627 //#pragma omp parallel for reduction(max:bestAF)
Marc Kupietz6d0fa542021-02-26 09:24:35 +0100628 // #pragma omp target teams distribute parallel for reduction(max:bestAF) map(tofrom:bestAF,currentAF,bestWindow,usedPositions)
Marc Kupietz12af0192021-03-13 18:05:14 +0100629 for (int bitmask = 1; bitmask < (1 << (2 * WINDOW_SIZE)); bitmask++) {
630 if ((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0) continue;
631 uint64_t currentWindowSum = 0;
Marc Kupietz6d0fa542021-02-26 09:24:35 +0100632 // #pragma omp target teams distribute parallel for reduction(+:currentWindowSum) map(tofrom:bitmask,usedPositions)
Marc Kupietz12af0192021-03-13 18:05:14 +0100633 for (int pos = 0; pos < 2 * WINDOW_SIZE; pos++) {
634 if (((1 << pos) & bitmask & usedPositions) != 0)
635 currentWindowSum += sumWindow[pos];
Marc Kupietz8c62c372019-01-31 12:21:01 +0100636 }
637 currentAF = ca_logdice(f1, f2, currentWindowSum, total, __builtin_popcount(bitmask));
Marc Kupietz12af0192021-03-13 18:05:14 +0100638 if (currentAF > bestAF) {
Marc Kupietz8c62c372019-01-31 12:21:01 +0100639 bestAF = currentAF;
640 bestWindow = bitmask;
641 }
642 }
643
Marc Kupietz0421d092021-03-13 18:05:14 +0100644 *result = {w2,
645 f2,
Marc Kupietz12af0192021-03-13 18:05:14 +0100646 sum,
Marc Kupietz0421d092021-03-13 18:05:14 +0100647 pmi,
Marc Kupietz12af0192021-03-13 18:05:14 +0100648 pmi / (-log2(o / total / true_window_size)),
Marc Kupietz0421d092021-03-13 18:05:14 +0100649 llr,
Marc Kupietz12af0192021-03-13 18:05:14 +0100650 lfmd,
651 md,
Marc Kupietz0421d092021-03-13 18:05:14 +0100652 sumWindow[WINDOW_SIZE],
Marc Kupietz12af0192021-03-13 18:05:14 +0100653 sumWindow[WINDOW_SIZE - 1],
Marc Kupietz6d9221d2021-02-26 09:34:40 +0100654 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1),
Marc Kupietz12af0192021-03-13 18:05:14 +0100655 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE - 1], total, 1),
Marc Kupietz8c62c372019-01-31 12:21:01 +0100656 ca_dice(f1, f2, sum, total, true_window_size),
657 ld,
658 bestAF,
659 usedPositions,
660 bestWindow
661 };
662
663 }
664
Marc Kupietz88d116b2021-03-13 18:05:14 +0100665 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2) {
Marc Kupietz75af60f2019-01-22 22:34:29 +0100666 std::vector<Collocator> collocators;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100667 uint64_t w2, last_w2 = 0xffffffffffffffff;
Marc Kupietz8c62c372019-01-31 12:21:01 +0100668 uint64_t maxv = 0, sum = 0;
Marc Kupietz12af0192021-03-13 18:05:14 +0100669 uint64_t *sumWindow = (uint64_t *) malloc(sizeof(uint64_t) * 2 * WINDOW_SIZE);
670 memset(sumWindow, 0, sizeof(uint64_t) * 2 * WINDOW_SIZE);
Marc Kupietzade33222019-01-22 22:52:44 +0100671 int true_window_size = 1;
Marc Kupietz12af0192021-03-13 18:05:14 +0100672 int usedPositions = 0;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100673
Marc Kupietz12af0192021-03-13 18:05:14 +0100674 if (w1 > _vocab.size()) {
675 std::cout << w1 << "> vocabulary size " << _vocab.size() << "\n";
676 w1 -= _vocab.size();
677 }
678#ifdef DEBUG
679 std::cout << "Searching for collocates of " << _vocab[w1].word << "\n";
680#endif
681 // #pragma omp parallel num_threads(40)
682 // #pragma omp single
683 for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, min_w2, 0)); it->isValid(); it->Next()) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100684 uint64_t value = it->intValue(),
Marc Kupietz12af0192021-03-13 18:05:14 +0100685 key = it->intKey();
686 if ((w2 = W2(key)) > max_w2)
Marc Kupietzbd966192018-10-13 14:14:37 +0200687 continue;
Marc Kupietz12af0192021-03-13 18:05:14 +0100688 if (last_w2 == 0xffffffffffffffff) last_w2 = w2;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100689 if (w2 != last_w2) {
Marc Kupietz75af60f2019-01-22 22:34:29 +0100690 if (sum >= FREQUENCY_THRESHOLD) {
Marc Kupietz8c62c372019-01-31 12:21:01 +0100691 collocators.push_back({});
Marc Kupietz12af0192021-03-13 18:05:14 +0100692 rocksdb::Collocator *result = &(collocators[collocators.size() - 1]);
693 // #pragma omp task firstprivate(last_w2, sumWindow, sum, usedPositions, true_window_size) shared(w1, result) if(sum > 1000000)
Marc Kupietz8c62c372019-01-31 12:21:01 +0100694 {
695 // uint64_t *nsw = (uint64_t *)malloc(sizeof(uint64_t) * 2 *WINDOW_SIZE);
696 // memcpy(nsw, sumWindow, sizeof(uint64_t) * 2 *WINDOW_SIZE);
697 applyCAMeasures(w1, last_w2, sumWindow, sum, usedPositions, true_window_size, result);
698 // free(nsw);
Marc Kupietz75af60f2019-01-22 22:34:29 +0100699 }
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100700 }
Marc Kupietz12af0192021-03-13 18:05:14 +0100701 memset(sumWindow, 0, 2 * WINDOW_SIZE * sizeof(uint64_t));
702 usedPositions = 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
703 sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100704 last_w2 = w2;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100705 maxv = value;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100706 sum = value;
Marc Kupietzade33222019-01-22 22:52:44 +0100707 true_window_size = 1;
Marc Kupietz12af0192021-03-13 18:05:14 +0100708 if (min_w2 == max_w2 && w2 != min_w2)
709 break;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100710 } else {
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100711 sum += value;
Marc Kupietz12af0192021-03-13 18:05:14 +0100712 if (value > maxv)
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100713 maxv = value;
Marc Kupietz12af0192021-03-13 18:05:14 +0100714 usedPositions |= 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
715 sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
Marc Kupietzade33222019-01-22 22:52:44 +0100716 true_window_size++;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100717 }
718 }
719
Marc Kupietz12af0192021-03-13 18:05:14 +0100720 // #pragma omp taskwait
Marc Kupietz6d0fa542021-02-26 09:24:35 +0100721 sort(collocators.begin(), collocators.end(), sortByLogDiceAF);
Marc Kupietz8c62c372019-01-31 12:21:01 +0100722
Marc Kupietz12af0192021-03-13 18:05:14 +0100723#ifdef DEBUG
Marc Kupietzd31254c2018-01-20 21:29:30 +0100724 int i=0;
725 for (Collocator c : collocators) {
726 if(i++>10) break;
Marc Kupietz8c62c372019-01-31 12:21:01 +0100727 std::cout << "w1:" << _vocab[w1].word << ", w2: *" << _vocab[c.w2].word << "*"
Marc Kupietzd31254c2018-01-20 21:29:30 +0100728 << "\t f(w1):" << _vocab[w1].freq
729 << "\t f(w2):" << _vocab[c.w2].freq
Marc Kupietz51f93792018-01-25 08:51:01 +0100730 << "\t f(w1, w2):" << c.raw
Marc Kupietzd31254c2018-01-20 21:29:30 +0100731 << "\t pmi:" << c.pmi
732 << "\t npmi:" << c.npmi
733 << "\t llr:" << c.llr
Marc Kupietz8c62c372019-01-31 12:21:01 +0100734 << "\t md:" << c.md
Marc Kupietzd31254c2018-01-20 21:29:30 +0100735 << "\t lfmd:" << c.lfmd
Marc Kupietzd31254c2018-01-20 21:29:30 +0100736 << "\t total:" << total
737 << std::endl;
738 }
Marc Kupietz12af0192021-03-13 18:05:14 +0100739#endif
Marc Kupietz8c62c372019-01-31 12:21:01 +0100740
741 return collocators;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100742 }
743
Marc Kupietz88d116b2021-03-13 18:05:14 +0100744
745 std::vector<Collocator> rocksdb::CollocatorDB::get_collocation_scores(uint32_t w1, uint32_t w2) {
746 return get_collocators(w1, w2, w2);
747 }
748
Marc Kupietz8c62c372019-01-31 12:21:01 +0100749 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
Marc Kupietz88d116b2021-03-13 18:05:14 +0100750 return get_collocators(w1, 0, UINT32_MAX);
Marc Kupietzbd966192018-10-13 14:14:37 +0200751 }
752
Marc Kupietz3400aa52018-06-05 10:28:55 +0200753 void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100754 std::vector<Collocator> collocators;
Marc Kupietz3400aa52018-06-05 10:28:55 +0200755 std::stringstream stream;
756 uint64_t w2, last_w2 = 0xffffffffffffffff;
757 uint64_t maxv = 0, total_w1 = 0;
758 bool first = true;
Marc Kupietz12af0192021-03-13 18:05:14 +0100759 for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
Marc Kupietz3400aa52018-06-05 10:28:55 +0200760 uint64_t value = it->intValue(),
Marc Kupietz12af0192021-03-13 18:05:14 +0100761 key = it->intKey();
Marc Kupietz3400aa52018-06-05 10:28:55 +0200762 w2 = W2(key);
763 total_w1 += value;
Marc Kupietz12af0192021-03-13 18:05:14 +0100764 if (last_w2 == 0xffffffffffffffff) last_w2 = w2;
Marc Kupietz3400aa52018-06-05 10:28:55 +0200765 if (w2 != last_w2) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100766 if (maxv >= min_cooccur) {
767 double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
768 if (first)
Marc Kupietz3400aa52018-06-05 10:28:55 +0200769 first = false;
770 else
Marc Kupietz12af0192021-03-13 18:05:14 +0100771 stream << " ";
772 stream << w2 << " " << llr;
Marc Kupietz3400aa52018-06-05 10:28:55 +0200773 }
774 last_w2 = w2;
775 maxv = value;
776 } else {
Marc Kupietz12af0192021-03-13 18:05:14 +0100777 if (value > maxv)
Marc Kupietz3400aa52018-06-05 10:28:55 +0200778 maxv = value;
779 }
780 }
Marc Kupietz12af0192021-03-13 18:05:14 +0100781 if (first)
782 stream << "1 0.0";
783 stream << "\n";
Marc Kupietz3400aa52018-06-05 10:28:55 +0200784 std::cout << stream.str();
785 }
786
Marc Kupietz4b799e92018-01-02 11:04:56 +0100787 rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
Marc Kupietz12af0192021-03-13 18:05:14 +0100788
Marc Kupietz4b799e92018-01-02 11:04:56 +0100789 rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
Marc Kupietz12af0192021-03-13 18:05:14 +0100790
Marc Kupietz4b799e92018-01-02 11:04:56 +0100791 rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
792
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100793};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100794
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200795string rocksdb::CollocatorDB::getWord(uint32_t w1) {
796 return _vocab[w1].word;
797}
798
Marc Kupietz979580e2024-11-21 18:05:07 +0100799uint64_t rocksdb::CollocatorDB::getWordId(const char *word) const {
800 for (uint64_t i = 0; i < _vocab.size(); i++) {
801 if (strcmp(_vocab[i].word.c_str(), word) == 0)
802 return i;
803 }
804 return 0;
805}
806
Marc Kupietze9627152019-02-04 12:32:12 +0100807string rocksdb::CollocatorDB::collocators2json(uint32_t w1, vector<Collocator> collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100808 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100809 int i = 0;
Marc Kupietze9627152019-02-04 12:32:12 +0100810 s << " { \"f1\": " << _vocab[w1].freq << "," <<
811 "\"w1\":\"" << string(_vocab[w1].word) << "\", " <<
812 "\"N\": " << total << ", " <<
813 "\"collocates\": [";
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100814 bool first = true;
815 for (Collocator c : collocators) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100816 if (strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0) continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100817 if (i++ > 200)
818 break;
Marc Kupietz12af0192021-03-13 18:05:14 +0100819 if (!first)
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100820 s << ",\n";
821 else
822 first = false;
823 s << "{"
Marc Kupietz12af0192021-03-13 18:05:14 +0100824 "\"word\":\"" << (string(_vocab[c.w2].word).compare("<num>") == 0 ? string("###") : string(_vocab[c.w2].word))
825 << "\"," <<
826 "\"f2\":" << c.f2 << "," <<
827 "\"f\":" << c.raw << "," <<
828 "\"npmi\":" << c.npmi << "," <<
829 "\"pmi\":" << c.pmi << "," <<
830 "\"llr\":" << c.llr << "," <<
831 "\"lfmd\":" << c.lfmd << "," <<
832 "\"md\":" << c.md << "," <<
833 "\"dice\":" << c.dice << "," <<
834 "\"ld\":" << c.logdice << "," <<
Marc Kupietz97f433b2021-03-13 18:10:52 +0100835 "\"ln_count\":" << c.left_raw << "," <<
836 "\"rn_count\":" << c.right_raw << "," <<
837 "\"ln_pmi\":" << c.left_pmi << "," <<
838 "\"rn_pmi\":" << c.right_pmi << "," <<
839 "\"ldaf\":" << c.ldaf << "," <<
Marc Kupietze9f58932019-01-24 15:12:59 +0100840 "\"win\":" << c.window << "," <<
Marc Kupietz12af0192021-03-13 18:05:14 +0100841 "\"afwin\":" << c.af_window <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100842 "}";
843 }
Marc Kupietze9627152019-02-04 12:32:12 +0100844 s << "]}\n";
Marc Kupietz0421d092021-03-13 18:05:14 +0100845 // std::cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100846 return s.str();
847}
848
Marc Kupietz6aec7682018-01-10 09:47:48 +0100849typedef rocksdb::CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100850
851extern "C" {
Marc Kupietz12af0192021-03-13 18:05:14 +0100852#ifdef __clang__
853#pragma clang diagnostic push
854#pragma ide diagnostic ignored "OCUnusedGlobalDeclarationInspection"
855#endif
Marc Kupietz44229232024-08-05 15:00:20 +0200856 DLL_EXPORT COLLOCATORS *open_collocatordb_for_write(char *dbname) {
Marc Kupietz6aec7682018-01-10 09:47:48 +0100857 return new rocksdb::CollocatorDB(dbname, false);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100858 }
Marc Kupietz12af0192021-03-13 18:05:14 +0100859
Marc Kupietz44229232024-08-05 15:00:20 +0200860 DLL_EXPORT COLLOCATORS *open_collocatordb(char *dbname) {
Marc Kupietz6aec7682018-01-10 09:47:48 +0100861 return new rocksdb::CollocatorDB(dbname, true);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100862 }
863
Marc Kupietz44229232024-08-05 15:00:20 +0200864 DLL_EXPORT void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100865 db->inc(w1, w2, dist);
866 }
867
Marc Kupietz44229232024-08-05 15:00:20 +0200868 DLL_EXPORT void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100869 db->dump(w1, w2, dist);
870 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100871
Marc Kupietz44229232024-08-05 15:00:20 +0200872 DLL_EXPORT COLLOCATORS *get_collocators(COLLOCATORS *db, uint32_t w1) {
Marc Kupietz6663f112021-03-14 09:20:59 +0100873 std::vector<Collocator> c = db->get_collocators(w1);
874 if (c.empty())
875 return NULL;
876 uint64_t size = c.size() + sizeof c[0];
877 COLLOCATORS *p = (COLLOCATORS *) malloc(size);
878 memcpy(p, c.data(), size);
879 return p;
Marc Kupietz12af0192021-03-13 18:05:14 +0100880 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100881
Marc Kupietz44229232024-08-05 15:00:20 +0200882 DLL_EXPORT COLLOCATORS *get_collocation_scores(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
Marc Kupietz6663f112021-03-14 09:20:59 +0100883 std::vector<Collocator> c = db->get_collocation_scores(w1, w2);
884 if (c.empty())
885 return NULL;
886 uint64_t size = c.size() + sizeof c[0];
887 COLLOCATORS *p = (COLLOCATORS *) malloc(size);
888 memcpy(p, c.data(), size);
889 return p;
Marc Kupietz12af0192021-03-13 18:05:14 +0100890 }
Marc Kupietz88d116b2021-03-13 18:05:14 +0100891
Marc Kupietz44229232024-08-05 15:00:20 +0200892 DLL_EXPORT char *get_word(COLLOCATORS *db, uint32_t w) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100893 return strdup(db->getWord(w).c_str());
894 }
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200895
Marc Kupietz979580e2024-11-21 18:05:07 +0100896 DLL_EXPORT uint64_t get_word_id(COLLOCATORS *db, char *word) {
897 return db->getWordId(word);
898 }
899
Marc Kupietz44229232024-08-05 15:00:20 +0200900 DLL_EXPORT void read_vocab(COLLOCATORS *db, char *fname) {
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100901 std::string fName(fname);
902 db->readVocab(fName);
903 }
904
Marc Kupietz44229232024-08-05 15:00:20 +0200905 DLL_EXPORT const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100906 return strdup(db->collocators2json(w1, db->get_collocators(w1)).c_str());
907 }
Marc Kupietz88d116b2021-03-13 18:05:14 +0100908
Marc Kupietz44229232024-08-05 15:00:20 +0200909 DLL_EXPORT const char *get_collocation_scores_as_json(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
Marc Kupietz12af0192021-03-13 18:05:14 +0100910 return strdup(db->collocators2json(w1, db->get_collocation_scores(w1, w2)).c_str());
911 }
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100912
Marc Kupietz6208fd72024-11-15 15:46:19 +0100913 DLL_EXPORT const char *get_version() {
914 return PROJECT_VERSION;
915 }
916
// Restore the diagnostic state saved by the `#pragma clang diagnostic push`
// at the top of this extern "C" block (this used to be a second `push`,
// leaving the saved state unbalanced).
#ifdef __clang__
#pragma clang diagnostic pop
#endif
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100920}