blob: eaf0296850e66e388899311f2ce3e9c3affdfe1b [file] [log] [blame]
// Symbol-visibility helpers. The visibility attribute only accepts
// "default", "hidden", "internal" or "protected"; "default" is the value
// that makes a symbol visible outside the shared object.
#define EXPORT __attribute__((visibility("default")))
#define IMPORT
Marc Kupietz12af0192021-03-13 18:05:14 +01003
Marc Kupietz39887082024-11-22 18:06:20 +01004#include "config.h"
5#include "export.h"
6#include "merge_operators.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +01007#include "rocksdb/db.h"
8#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +01009#include "rocksdb/table.h"
Marc Kupietze889cec2024-11-23 12:08:42 +010010#include "rocksdb/slice.h"
Marc Kupietz39887082024-11-22 18:06:20 +010011#include <algorithm>
12#include <cassert>
13#include <cmath>
14#include <cstdint>
15#include <iostream>
16#include <memory>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010017#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010018#include <rocksdb/slice_transform.h>
Marc Kupietz39887082024-11-22 18:06:20 +010019#include <sstream> // for ostringstream
20#include <string>
Marc Kupietzc630c152025-01-23 11:17:47 +010021#include <thread>
Marc Kupietz39887082024-11-22 18:06:20 +010022#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010023
// Arrays of 2 * WINDOW_SIZE slots are used for per-position counts.
#define WINDOW_SIZE 5
// Minimum co-occurrence count below which ca_pmi/ca_npmi return -1 and
// collocate groups are dropped.
#define FREQUENCY_THRESHOLD 5
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)

// 64-bit collocation key layout: bits 0-23 hold w1, bits 24 and up hold w2,
// and the top byte (bits 56-63) holds the distance.
// Every macro argument is parenthesised so that expression arguments such as
// `a ? b : c` or `x | y` expand correctly.
#define encodeCollocation(w1, w2, dist)                                      \
  (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (uint64_t)(w1))
#define W1(key) ((uint64_t)((key) & 0xffffff))
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010032
// Plain vocabulary record: a frequency count plus a C-string word.
struct vocab_entry {
  uint64_t freq;
  char *word;
};
Marc Kupietzc8ddf452018-01-07 21:33:12 +010037
38// typedef struct Collocator {
39// uint64_t w2;
40// uint64_t sum;
41// };
42
Marc Kupietz28cc53e2017-12-23 17:24:55 +010043using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010044using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010045
Marc Kupietz4b799e92018-01-02 11:04:56 +010046namespace rocksdb {
// One scored collocate of a node word. The member order is significant:
// CollocatorDB::applyCAMeasures fills the record by aggregate initialisation
// and the C API copies the raw bytes to callers.
struct Collocator {
  uint32_t w2;        // id of the collocate
  uint64_t f2;        // vocabulary frequency of the collocate
  uint64_t raw;       // summed co-occurrence count
  double pmi;         // pointwise mutual information
  double npmi;        // normalised PMI
  double llr;         // log-likelihood ratio
  double lfmd;        // log2(o^3 / e)
  double md;          // log2(o^2 / e)
  double md_nws;      // md computed with the nominal window 2 * WINDOW_SIZE
  uint64_t left_raw;  // count taken from window slot WINDOW_SIZE
  uint64_t right_raw; // count taken from window slot WINDOW_SIZE - 1
  double left_pmi;    // PMI computed from left_raw with window size 1
  double right_pmi;   // PMI computed from right_raw with window size 1
  double dice;        // Dice coefficient
  double logdice;     // logDice over all used positions
  double ldaf;        // best logDice over subsets of the used positions
  int window;         // bitmask of the positions that had counts
  int af_window;      // bitmask of the subset that produced ldaf
};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +010068
// Running count of merge-operator invocations.
size_t num_merge_operator_calls = 0;

// Sets the merge-operator call counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = size_t{0};
}

// Counter for partial merges.
size_t num_partial_merge_calls = 0;

// Sets the partial-merge call counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = size_t{0};
}
76
// Writes `value` into the 8 bytes at `buf`, least-significant byte first.
//
// The byte-wise form yields the same layout on every host, so no run-time
// endianness probe (which would read a string literal through a uint16_t
// pointer) is needed.
//
// @param buf   destination with room for at least 8 bytes
// @param value integer to serialise
inline void EncodeFixed64(char *buf, uint64_t value) {
  for (unsigned i = 0; i < sizeof(value); ++i) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}
91
// Reads a 32-bit unsigned integer stored least-significant byte first.
//
// Each byte goes through unsigned char so that bytes >= 0x80 are not
// sign-extended. The byte-wise form is endian-neutral, so no run-time
// endianness probe is required.
//
// @param ptr source with at least 4 readable bytes
// @return the decoded value
inline uint32_t DecodeFixed32(const char *ptr) {
  uint32_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i]))
              << (8 * i);
  }
  return result;
}
105
// Reads a 64-bit unsigned integer stored least-significant byte first.
//
// Endian-neutral and self-contained: bytes are widened through unsigned char
// to avoid sign extension and then shifted into place.
//
// @param ptr source with at least 8 readable bytes
// @return the decoded value
inline uint64_t DecodeFixed64(const char *ptr) {
  uint64_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i]))
              << (8 * i);
  }
  return result;
}
118
119static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12,
120 uint64_t total, double window_size) {
121 double r1 = f1 * window_size, c1 = f2, e = r1 * c1 / total, o = f12;
122 if (f12 < FREQUENCY_THRESHOLD)
123 return -1.0;
124 else
125 return log2(o / e);
126}
127
128// Bouma, Gerlof (2009): <a
129// href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
130// Normalized (pointwise) mutual information in collocation extraction</a>. In
131// Proceedings of GSCL.
132static double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12,
133 uint64_t total, double window_size) {
134 double r1 = f1 * window_size, c1 = f2, e = r1 * c1 / total, o = f12;
135 if (f12 < FREQUENCY_THRESHOLD)
136 return -1.0;
137 else
138 return log2(o / e) / (-log2(o / total / window_size));
139}
140
// Mutual dependency: log2(observed^2 / expected).
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of
// collocation extraction metrics. In: International Conference on Language
// Resources and Evaluation (LREC-2002). (2002) 620–625
//
// Earlier formulation, kept for reference:
//   double md = log2(pow((double)max * window_size / total, 2) /
//                    (window_size * ((double)_vocab[w1].freq/total) *
//                     ((double)_vocab[last_w2].freq/total)));
static double ca_md(uint64_t f1, uint64_t f2, uint64_t f12,
                    uint64_t total, double window_size) {
  const double row = f1 * window_size;
  const double col = f2;
  const double expected = row * col / total;
  const double observed = f12;
  const double ratio = observed * observed / expected;
  return log2(ratio);
}
154
// Log-frequency biased mutual dependency: log2(observed^3 / expected).
// Returns 0 when the pair was never observed (f12 == 0).
static double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12,
                      uint64_t total, double window_size) {
  const double row = f1 * window_size;
  const double col = f2;
  const double expected = row * col / total;
  const double observed = f12;
  if (f12 != 0) {
    return log2(observed * observed * observed / expected);
  }
  return 0;
}
162
// Log-likelihood ratio over the 2x2 contingency table of a word pair.
// Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and
// Collocations. PhD dissertation, IMS, University of Stuttgart. Published in
// 2005, URN urn:nbn:de:bsz:93-opus-23714. Free PDF available from
// http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
//
// w1/w2 are the marginal frequencies, w12 the joint frequency, n the corpus
// size; the row total is w1 * window_size. Cells with a non-positive
// observed count contribute 0.
static double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n,
                    uint64_t window_size) {
  // Marginals.
  const double r1 = (double)w1 * window_size;
  const double r2 = (double)n - r1;
  const double c1 = w2;
  const double c2 = n - c1;
  // Observed cells.
  const double o11 = w12;
  const double o12 = r1 - o11;
  const double o21 = c1 - w12;
  const double o22 = r2 - o21;
  // Expected cells under independence.
  const double e11 = r1 * c1 / n;
  const double e12 = r1 * c2 / n;
  const double e21 = r2 * c1 / n;
  const double e22 = r2 * c2 / n;

  const auto cell = [](double observed, double expected) -> double {
    return observed > 0 ? observed * log(observed / expected) : 0;
  };
  return 2 * (cell(o11, e11) + cell(o12, e12) + cell(o21, e21) +
              cell(o22, e22));
}
178
// Dice coefficient: 2 * w12 / (w2 + w1 * window_size).
// The corpus size `n` is accepted for signature parity with the other
// measures but does not enter the formula.
static double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n,
                      uint64_t window_size) {
  const double row = (double)w1 * window_size;
  const double col = w2;
  const uint64_t doubled_joint = 2 * w12;
  return doubled_joint / (col + row);
}
184
// logDice: 14 + log2(Dice coefficient).
// Rychlý, Pavel (2008): A lexicographer-friendly association score. In
// Proceedings of Recent Advances in Slavonic Natural Language Processing,
// RASLAN, 6–9.
// http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf
//
// The corpus size `n` is accepted for signature parity but is not used.
static double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12,
                         uint64_t n, uint64_t window_size) {
  const double row = (double)w1 * window_size;
  const double col = w2;
  const uint64_t doubled_joint = 2 * w12;
  const double dice = doubled_joint / (col + row);
  return 14 + log2(dice);
}
194
195class CountMergeOperator : public AssociativeMergeOperator {
196public:
197 CountMergeOperator() {
198 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
199 }
200
201 bool Merge(const Slice &key, const Slice *existing_value,
202 const Slice &value, std::string *new_value,
203 Logger *logger) const override {
204 assert(new_value->empty());
205 ++num_merge_operator_calls;
206 if (existing_value == nullptr) {
207 new_value->assign(value.data(), value.size());
208 return true;
209 }
210
211 return mergeOperator_->PartialMerge(key, *existing_value, value, new_value,
212 logger);
213 }
214
215 const char *Name() const override { return "UInt64AddOperator"; }
216
217private:
218 std::shared_ptr<MergeOperator> mergeOperator_;
219};
220
221class CollocatorIterator : public Iterator {
222 char prefixc[sizeof(uint64_t)]{};
223 Iterator *base_iterator_;
224
225public:
226 explicit CollocatorIterator(Iterator *base_iterator) : base_iterator_(base_iterator) {}
227
228 void setPrefix(char *prefix) { memcpy(prefixc, prefix, sizeof(uint64_t)); }
229
230 void SeekToFirst() override { base_iterator_->SeekToFirst(); }
231
232 void SeekToLast() override { base_iterator_->SeekToLast(); }
233
234 void Seek(const rocksdb::Slice &s) override { base_iterator_->Seek(s); }
235
236 void SeekForPrev(const rocksdb::Slice &s) override {
237 base_iterator_->SeekForPrev(s);
238 }
239
240 void Prev() override { base_iterator_->Prev(); }
241
242 void Next() override { base_iterator_->Next(); }
243
244 Slice key() const override;
245
246 Slice value() const override;
247
248 Status status() const override;
249
250 bool Valid() const override;
251
252 bool isValid();
253
254 uint64_t intValue();
255
256 uint64_t intKey();
257};
258
259// rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
260
261bool CollocatorIterator::Valid() const {
262 return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
263}
264
265bool CollocatorIterator::isValid() {
266 return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
267 // return key().starts_with(std::string(prefixc,3));
268}
269
270uint64_t CollocatorIterator::intKey() {
271 return DecodeFixed64(base_iterator_->key().data());
272}
273
274uint64_t CollocatorIterator::intValue() {
275 return DecodeFixed64(base_iterator_->value().data());
276}
277
// Vocabulary entry: a word and its corpus frequency. The member order
// matters because entries are created by aggregate initialisation.
struct VocabEntry {
  std::string word;
  uint64_t freq;
};
283
284class CollocatorDB {
285 WriteOptions merge_option_; // for merge
286 char _one[sizeof(uint64_t)]{};
287 Slice _one_slice;
288 vector<VocabEntry> _vocab;
289 uint64_t total = 0;
290 uint64_t sentences = 0;
291 float avg_window_size = 8.0;
292
293protected:
294 std::shared_ptr<DB> db_;
295
296 WriteOptions put_option_;
297 ReadOptions get_option_;
298 WriteOptions delete_option_;
299
300 uint64_t default_{};
301
302 std::shared_ptr<DB> OpenDb(const char *dbname);
303
304 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
305
306public:
307 virtual ~CollocatorDB() = default;
308 void readVocab(const string& fname);
309 string getWord(uint32_t w1);
310
311 uint64_t getWordId(const char *word) const;
312
Marc Kupietzd26b1052024-12-10 16:56:39 +0100313 uint64_t getCorpusSize() const;
314
Marc Kupietz21b964c2024-12-10 17:10:50 +0100315 uint64_t getWordFrequency(uint64_t w1);
316
Marc Kupietz39887082024-11-22 18:06:20 +0100317 CollocatorDB(const char *db_name, bool read_only);
318
319 // public interface of CollocatorDB.
320 // All four functions return false
321 // if the underlying level db operation failed.
322
323 // mapped to a levedb Put
324 bool set(const std::string &key, uint64_t value) {
325 // just treat the internal rep of int64 as the string
326 char buf[sizeof(value)];
327 EncodeFixed64(buf, value);
328 Slice slice(buf, sizeof(value));
329 auto s = db_->Put(put_option_, key, slice);
330
331 if (s.ok()) {
332 return true;
333 } else {
334 std::cerr << s.ToString() << std::endl;
335 return false;
336 }
337 }
338
339 DB *getDb() { return db_.get(); }
340
341 // mapped to a rocksdb Delete
342 bool remove(const std::string &key) {
343 auto s = db_->Delete(delete_option_, key);
344
345 if (s.ok()) {
346 return true;
347 } else {
348 std::cerr << s.ToString() << std::endl;
349 return false;
350 }
351 }
352
353 // mapped to a rocksdb Get
354 bool get(const std::string &key, uint64_t *value) {
355 std::string str;
356 auto s = db_->Get(get_option_, key, &str);
357
358 if (s.IsNotFound()) {
359 // return default value if not found;
360 *value = default_;
361 return true;
362 } else if (s.ok()) {
363 // deserialization
364 if (str.size() != sizeof(uint64_t)) {
365 std::cerr << "value corruption\n";
366 return false;
367 }
368 *value = DecodeFixed64(&str[0]);
369 return true;
370 } else {
371 std::cerr << s.ToString() << std::endl;
372 return false;
373 }
374 }
375
376 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
377 char encoded_key[sizeof(uint64_t)];
378 EncodeFixed64(encoded_key, encodeCollocation(w1, w2, dist));
379 uint64_t value = default_;
380 get(std::string(encoded_key, 8), &value);
381 return value;
382 }
383
384 virtual void inc(const std::string &key) {
385 db_->Merge(merge_option_, key, _one_slice);
386 }
387
388 void inc(const uint64_t key) {
389 char encoded_key[sizeof(uint64_t)];
390 EncodeFixed64(encoded_key, key);
391 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
392 }
393
394 virtual void inc(uint32_t w1, uint32_t w2, uint8_t dist);
395
396 void dump(uint32_t w1, uint32_t w2, int8_t dist) const;
397
398 vector<Collocator> get_collocators(uint32_t w1);
399
400 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
401
402 vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
403
404 vector<Collocator> get_collocators(uint32_t w1, uint32_t min_w2,
405 uint32_t max_w2);
406
407 void applyCAMeasures(uint32_t w1, uint32_t w2,
408 uint64_t *sumWindow, uint64_t sum,
409 int usedPositions, int true_window_size,
410 Collocator *result) const;
411
412 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
413
414 string collocators2json(uint32_t w1, const vector<Collocator>& collocators);
415
416 // mapped to a rocksdb Merge operation
417 virtual bool add(const std::string &key, uint64_t value) {
418 char encoded[sizeof(uint64_t)];
419 EncodeFixed64(encoded, value);
420 Slice slice(encoded, sizeof(uint64_t));
421 auto s = db_->Merge(merge_option_, key, slice);
422
423 if (s.ok()) {
424 return true;
425 } else {
426 std::cerr << s.ToString() << std::endl;
427 return false;
428 }
429 }
430
431 CollocatorIterator *SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) const;
432};
433
434CollocatorDB::CollocatorDB(const char *db_name,
435 bool read_only = false) {
436 // merge_option_.sync = true;
437 if (read_only)
438 db_ = OpenDbForRead(strdup(db_name));
439 else
440 db_ = OpenDb(db_name);
441 assert(db_);
442 uint64_t one = 1;
443 EncodeFixed64(_one, one);
444 _one_slice = Slice(_one, sizeof(uint64_t));
445}
446
447void CollocatorDB::inc(const uint32_t w1, const uint32_t w2,
448 const uint8_t dist) {
449 inc(encodeCollocation(w1, w2, dist));
450}
451
452void CollocatorDB::readVocab(const string& fname) {
453 char strbuf[2048];
454 uint64_t freq;
455 FILE *fin = fopen(fname.c_str(), "rb");
456 if (fin == nullptr) {
457 cout << "Vocabulary file " << fname << " not found\n";
458 exit(1);
459 }
460 uint64_t i = 0;
461 while (fscanf(fin, "%s %lu", strbuf, &freq) == 2) {
462 _vocab.push_back({strbuf, freq});
463 total += freq;
464 i++;
465 }
466 fclose(fin);
467
468 char size_fname[256];
469 strcpy(size_fname, fname.c_str());
470 char *pos = strstr(size_fname, ".vocab");
471 if (pos) {
472 *pos = 0;
473 strcat(size_fname, ".size");
474 FILE *fp = fopen(size_fname, "r");
475 if (fp != nullptr) {
476 fscanf(fp, "%lu", &sentences);
477 fscanf(fp, "%lu", &total);
478 float sl = (float)total / (float)sentences;
479 float w = WINDOW_SIZE;
480 avg_window_size =
481 ((sl > 2 * w ? (sl - 2 * w) * 2 * w : 0) + (double)w * (3 * w - 1)) /
482 sl;
483 fprintf(stdout,
484 "Size corrections found: corpus size: %lu tokens in %lu "
485 "sentences, avg. sentence size: %f, avg. window size: %f\n",
486 total, sentences, sl, avg_window_size);
487 fclose(fp);
488 } else {
489 // std::cout << "size file " << size_fname << " not found\n";
490 }
491 } else {
492 std::cout << "cannot determine size file " << size_fname << "\n";
493 }
494}
495
496std::shared_ptr<DB> CollocatorDB::OpenDbForRead(const char *name) {
497 DB *db;
498 Options options;
499 options.env->SetBackgroundThreads(4);
500 options.create_if_missing = true;
501 options.merge_operator = std::make_shared<CountMergeOperator>();
502 options.max_successive_merges = 0;
503 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
504 options.IncreaseParallelism();
505 options.OptimizeLevelStyleCompaction();
506 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
507 ostringstream dbname, vocabname;
508 dbname << name << ".rocksdb";
509 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
510 if (!s.ok()) {
511 std::cerr << s.ToString() << std::endl;
512 assert(false);
513 }
514 vocabname << name << ".vocab";
515 readVocab(vocabname.str());
516 return std::shared_ptr<DB>(db);
517}
518
Marc Kupietzc630c152025-01-23 11:17:47 +0100519 std::shared_ptr<DB> CollocatorDB::OpenDb(const char *dbname) {
520 DB *db;
521 Options options;
Marc Kupietz39887082024-11-22 18:06:20 +0100522
Marc Kupietzc630c152025-01-23 11:17:47 +0100523 int max_cores = static_cast<int>(std::thread::hardware_concurrency());
524
525 // options.env->SetBackgroundThreads(32, Env::Priority::HIGH); // Increase background threads for high priority
526 // options.env->SetBackgroundThreads(16, Env::Priority::LOW); // Increase background threads for low priority
527 options.create_if_missing = true;
528 options.merge_operator = std::make_shared<CountMergeOperator>();
Marc Kupietz0fde12e2025-01-23 12:32:32 +0100529 options.max_background_jobs = max_cores * 2;
530 options.soft_pending_compaction_bytes_limit = 64 * 1024 * 1024; // 64MB
531 options.hard_pending_compaction_bytes_limit = 128 * 1024 * 1024; // 128MB
532
Marc Kupietzc630c152025-01-23 11:17:47 +0100533 //options.max_successive_merges = 0;
534 // options.IncreaseParallelism(max_cores); // Utilize all available cores
535 // options.OptimizeLevelStyleCompaction();
536
537 // Increase write buffer size and number of write buffers
538 // options.write_buffer_size = 512 * 1024 * 1024; // 512MB
539 // options.max_write_buffer_number = max_cores;
540 // options.min_write_buffer_number_to_merge = max_cores / 2;
541
542 // Enable concurrent memtable writes
543 options.allow_concurrent_memtable_write = true;
544 options.enable_write_thread_adaptive_yield = true;
545 options.allow_mmap_writes = true;
546 options.allow_mmap_reads = true;
547
548 // Optimize block cache size
549 BlockBasedTableOptions table_options;
550 table_options.block_cache = NewLRUCache(8 * 1024 * 1024 * 1024L); // 8GB block cache
551 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
552
553 // Adjust compaction settings
554 options.level0_file_num_compaction_trigger = 100;
555 options.level0_slowdown_writes_trigger = 200;
556 options.level0_stop_writes_trigger = 400;
557 options.max_background_compactions = max_cores / 2;
558 options.max_background_flushes = max_cores / 4;
559 // options.disableWA
560 // Tune write options
561 merge_option_.low_pri = true; // Use low priority for compactions
562 merge_option_.disableWAL = true; // Disable Write-Ahead Logging for faster writes
563 merge_option_.sync = false; // Disable sync for faster writes
564 merge_option_.no_slowdown = true; // Disable write slowdown for faster writes
565
566 Status s = DB::Open(options, dbname, &db);
567 if (!s.ok()) {
568 std::cerr << s.ToString() << std::endl;
569 assert(false);
570 }
571 total = 1000;
572 return std::shared_ptr<DB>(db);
Marc Kupietz39887082024-11-22 18:06:20 +0100573 }
Marc Kupietz39887082024-11-22 18:06:20 +0100574
575CollocatorIterator *
576CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) const {
577 ReadOptions options;
578 options.prefix_same_as_start = true;
579 char prefixc[sizeof(uint64_t)];
580 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
581 Iterator *it = db_->NewIterator(options);
582 auto *cit = new CollocatorIterator(it);
583 if (w2 > 0)
584 cit->Seek(std::string(prefixc, 6));
585 else
586 cit->Seek(std::string(prefixc, 3));
587 cit->setPrefix(prefixc);
588 return cit;
589}
590
591void CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) const {
592 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
593 for (; it->isValid(); it->Next()) {
594 uint64_t value = it->intValue();
595 uint64_t key = it->intKey();
596 std::cout << "w1:" << W1(key) << ", w2:" << W2(key)
597 << ", dist:" << (int32_t)DIST(key) << " - count:" << value
598 << std::endl;
599 }
600 std::cout << "ready dumping\n";
601}
602
603bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) {
604 return lhs.npmi > rhs.npmi;
605}
606
607bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) {
608 return lhs.lfmd > rhs.lfmd;
609}
610
611bool sortByLlr(const Collocator &lhs, const Collocator &rhs) {
612 return lhs.llr > rhs.llr;
613}
614
615bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) {
616 return lhs.logdice > rhs.logdice;
617}
618
619bool sortByLogDiceAF(const Collocator &lhs, const Collocator &rhs) {
620 return lhs.ldaf > rhs.ldaf;
621}
622
623void CollocatorDB::applyCAMeasures(
624 const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
625 const uint64_t sum, const int usedPositions, int true_window_size,
626 Collocator *result) const {
627 uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
628 double o = sum, r1 = f1 * true_window_size, c1 = f2, e = r1 * c1 / total,
629 pmi = log2(o / e), md = log2(o * o / e), lfmd = log2(o * o * o / e),
Marc Kupietze889cec2024-11-23 12:08:42 +0100630 llr = ca_ll(f1, f2, sum, total, true_window_size),
631 md_nws = ca_md(f1, f2, sum, total, 2 * WINDOW_SIZE),
632 ld = ca_logdice(f1, f2, sum, total, true_window_size);
Marc Kupietz39887082024-11-22 18:06:20 +0100633
634 int bestWindow = usedPositions;
635 double bestAF = ld;
636 // if(f1<75000000)
637 // #pragma omp parallel for reduction(max:bestAF)
638 // #pragma omp target teams distribute parallel for reduction(max:bestAF)
639 // map(tofrom:bestAF,currentAF,bestWindow,usedPositions)
640 for (int bitmask = 1; bitmask < (1 << (2 * WINDOW_SIZE)); bitmask++) {
641 if ((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0)
642 continue;
643 uint64_t currentWindowSum = 0;
644 // #pragma omp target teams distribute parallel for
645 // reduction(+:currentWindowSum) map(tofrom:bitmask,usedPositions)
646 for (int pos = 0; pos < 2 * WINDOW_SIZE; pos++) {
647 if (((1 << pos) & bitmask & usedPositions) != 0)
648 currentWindowSum += sumWindow[pos];
649 }
650 double currentAF = ca_logdice(f1, f2, currentWindowSum, total,
651 __builtin_popcount(bitmask));
652 if (currentAF > bestAF) {
653 bestAF = currentAF;
654 bestWindow = bitmask;
655 }
656 }
657
658 *result = {w2,
659 f2,
660 sum,
661 pmi,
662 pmi / (-log2(o / total / true_window_size)),
663 llr,
664 lfmd,
665 md,
Marc Kupietze889cec2024-11-23 12:08:42 +0100666 md_nws,
Marc Kupietz39887082024-11-22 18:06:20 +0100667 sumWindow[WINDOW_SIZE],
668 sumWindow[WINDOW_SIZE - 1],
669 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1),
670 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE - 1], total, 1),
671 ca_dice(f1, f2, sum, total, true_window_size),
672 ld,
673 bestAF,
674 usedPositions,
675 bestWindow};
676}
677
678std::vector<Collocator>
679CollocatorDB::get_collocators(uint32_t w1, uint32_t min_w2,
680 uint32_t max_w2) {
681 std::vector<Collocator> collocators;
682 uint64_t w2, last_w2 = 0xffffffffffffffff;
683 uint64_t maxv = 0, sum = 0;
684 auto *sumWindow =
685 static_cast<uint64_t *>(malloc(sizeof(uint64_t) * 2 * WINDOW_SIZE));
686 memset(sumWindow, 0, sizeof(uint64_t) * 2 * WINDOW_SIZE);
687 int true_window_size = 1;
688 int usedPositions = 0;
689
690 if (w1 > _vocab.size()) {
691 std::cout << w1 << "> vocabulary size " << _vocab.size() << "\n";
692 w1 -= _vocab.size();
693 }
694#ifdef DEBUG
695 std::cout << "Searching for collocates of " << _vocab[w1].word << "\n";
696#endif
697 // #pragma omp parallel num_threads(40)
698 // #pragma omp single
699 for (auto it =
700 std::unique_ptr<CollocatorIterator>(SeekIterator(w1, min_w2, 0));
701 it->isValid(); it->Next()) {
702 uint64_t value = it->intValue(), key = it->intKey();
703 if ((w2 = W2(key)) > max_w2)
704 continue;
705 if (last_w2 == 0xffffffffffffffff)
706 last_w2 = w2;
707 if (w2 != last_w2) {
708 if (sum >= FREQUENCY_THRESHOLD) {
709 collocators.push_back({});
710 Collocator *result = &(collocators[collocators.size() - 1]);
711 // #pragma omp task firstprivate(last_w2, sumWindow, sum, usedPositions,
712 // true_window_size) shared(w1, result) if(sum > 1000000)
713 {
714 // uint64_t *nsw = (uint64_t *)malloc(sizeof(uint64_t) * 2
715 // *WINDOW_SIZE); memcpy(nsw, sumWindow, sizeof(uint64_t) * 2
716 // *WINDOW_SIZE);
717 applyCAMeasures(w1, last_w2, sumWindow, sum, usedPositions,
718 true_window_size, result);
719 // free(nsw);
720 }
721 }
722 memset(sumWindow, 0, 2 * WINDOW_SIZE * sizeof(uint64_t));
723 usedPositions = 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
724 sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
725 last_w2 = w2;
726 maxv = value;
727 sum = value;
728 true_window_size = 1;
729 if (min_w2 == max_w2 && w2 != min_w2)
730 break;
731 } else {
732 sum += value;
733 if (value > maxv)
734 maxv = value;
735 usedPositions |=
736 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
737 sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
738 true_window_size++;
739 }
740 }
741
742 // #pragma omp taskwait
743 sort(collocators.begin(), collocators.end(), sortByLogDiceAF);
744
745#ifdef DEBUG
746 int i = 0;
747 for (Collocator c : collocators) {
748 if (i++ > 10)
749 break;
750 std::cout << "w1:" << _vocab[w1].word << ", w2: *" << _vocab[c.w2].word
751 << "*"
752 << "\t f(w1):" << _vocab[w1].freq
753 << "\t f(w2):" << _vocab[c.w2].freq << "\t f(w1, w2):" << c.raw
754 << "\t pmi:" << c.pmi << "\t npmi:" << c.npmi
755 << "\t llr:" << c.llr << "\t md:" << c.md << "\t lfmd:" << c.lfmd
756 << "\t total:" << total << std::endl;
757 }
758#endif
759
760 return collocators;
761}
762
763std::vector<Collocator>
764CollocatorDB::get_collocation_scores(uint32_t w1, uint32_t w2) {
765 return get_collocators(w1, w2, w2);
766}
767
768std::vector<Collocator> CollocatorDB::get_collocators(uint32_t w1) {
769 return get_collocators(w1, 0, UINT32_MAX);
770}
771
772void CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
773 std::vector<Collocator> collocators;
774 std::stringstream stream;
775 uint64_t w2, last_w2 = 0xffffffffffffffff;
776 uint64_t maxv = 0, total_w1 = 0;
777 bool first = true;
778 for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0));
779 it->isValid(); it->Next()) {
780 uint64_t value = it->intValue(), key = it->intKey();
781 w2 = W2(key);
782 total_w1 += value;
783 if (last_w2 == 0xffffffffffffffff)
784 last_w2 = w2;
785 if (w2 != last_w2) {
786 if (maxv >= min_cooccur) {
787 double llr =
788 ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
789 if (first)
790 first = false;
791 else
792 stream << " ";
793 stream << w2 << " " << llr;
794 }
795 last_w2 = w2;
796 maxv = value;
797 } else {
798 if (value > maxv)
799 maxv = value;
800 }
801 }
802 if (first)
803 stream << "1 0.0";
804 stream << "\n";
805 std::cout << stream.str();
806}
807
808Slice CollocatorIterator::key() const {
809 return base_iterator_->key();
810}
811
812Slice CollocatorIterator::value() const {
813 return base_iterator_->value();
814}
815
816Status CollocatorIterator::status() const {
817 return base_iterator_->status();
818}
819
820}; // namespace rocksdb
821
822string CollocatorDB::getWord(uint32_t w1) { return _vocab[w1].word; }
823
824uint64_t CollocatorDB::getWordId(const char *word) const {
Marc Kupietz979580e2024-11-21 18:05:07 +0100825 for (uint64_t i = 0; i < _vocab.size(); i++) {
826 if (strcmp(_vocab[i].word.c_str(), word) == 0)
827 return i;
828 }
829 return 0;
830}
831
Marc Kupietzd26b1052024-12-10 16:56:39 +0100832uint64_t CollocatorDB::getCorpusSize() const {
833 return total;
834}
835
Marc Kupietz21b964c2024-12-10 17:10:50 +0100836uint64_t CollocatorDB::getWordFrequency(uint64_t w1) {
837 return _vocab[w1].freq;
838}
839
Marc Kupietz39887082024-11-22 18:06:20 +0100840string CollocatorDB::collocators2json(uint32_t w1,
841 const vector<Collocator>& collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100842 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100843 int i = 0;
Marc Kupietz39887082024-11-22 18:06:20 +0100844 s << " { \"f1\": " << _vocab[w1].freq << "," << R"("w1":")"
845 << string(_vocab[w1].word) << "\", " << "\"N\": " << total << ", "
846 << "\"collocates\": [";
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100847 bool first = true;
848 for (Collocator c : collocators) {
Marc Kupietz39887082024-11-22 18:06:20 +0100849 if (strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0)
850 continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100851 if (i++ > 200)
852 break;
Marc Kupietz12af0192021-03-13 18:05:14 +0100853 if (!first)
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100854 s << ",\n";
855 else
856 first = false;
857 s << "{"
Marc Kupietz39887082024-11-22 18:06:20 +0100858 "\"word\":\""
859 << (string(_vocab[c.w2].word) == "<num>"
860 ? string("###")
861 : string(_vocab[c.w2].word))
862 << "\"," << "\"f2\":" << c.f2 << "," << "\"f\":" << c.raw << ","
863 << "\"npmi\":" << c.npmi << "," << "\"pmi\":" << c.pmi << ","
864 << "\"llr\":" << c.llr << "," << "\"lfmd\":" << c.lfmd << ","
Marc Kupietze889cec2024-11-23 12:08:42 +0100865 << "\"md\":" << c.md << "," << "\"md_nws\":" << c.md_nws << "," << "\"dice\":" << c.dice << ","
Marc Kupietz39887082024-11-22 18:06:20 +0100866 << "\"ld\":" << c.logdice << "," << "\"ln_count\":" << c.left_raw << ","
867 << "\"rn_count\":" << c.right_raw << "," << "\"ln_pmi\":" << c.left_pmi
868 << "," << "\"rn_pmi\":" << c.right_pmi << "," << "\"ldaf\":" << c.ldaf
869 << "," << "\"win\":" << c.window << "," << "\"afwin\":" << c.af_window
870 << "}";
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100871 }
Marc Kupietze9627152019-02-04 12:32:12 +0100872 s << "]}\n";
Marc Kupietz0421d092021-03-13 18:05:14 +0100873 // std::cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100874 return s.str();
875}
876
Marc Kupietz39887082024-11-22 18:06:20 +0100877typedef CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100878
879extern "C" {
Marc Kupietz12af0192021-03-13 18:05:14 +0100880#ifdef __clang__
881#pragma clang diagnostic push
882#pragma ide diagnostic ignored "OCUnusedGlobalDeclarationInspection"
883#endif
Marc Kupietz39887082024-11-22 18:06:20 +0100884DLL_EXPORT COLLOCATORS *open_collocatordb_for_write(char *dbname) {
885 return new CollocatorDB(dbname, false);
886}
Marc Kupietz12af0192021-03-13 18:05:14 +0100887
Marc Kupietz39887082024-11-22 18:06:20 +0100888DLL_EXPORT COLLOCATORS *open_collocatordb(char *dbname) {
889 return new CollocatorDB(dbname, true);
890}
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100891
Marc Kupietz39887082024-11-22 18:06:20 +0100892DLL_EXPORT void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2,
893 int8_t dist) {
894 db->inc(w1, w2, dist);
895}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100896
Marc Kupietz39887082024-11-22 18:06:20 +0100897DLL_EXPORT void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2,
898 int8_t dist) {
899 db->dump(w1, w2, dist);
900}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100901
Marc Kupietz39887082024-11-22 18:06:20 +0100902DLL_EXPORT COLLOCATORS *get_collocators(COLLOCATORS *db, uint32_t w1) {
903 std::vector<Collocator> c = db->get_collocators(w1);
904 if (c.empty())
905 return nullptr;
906 uint64_t size = c.size() + sizeof c[0];
907 auto *p = (COLLOCATORS *)malloc(size);
908 memcpy(p, c.data(), size);
909 return p;
910}
Marc Kupietz88d116b2021-03-13 18:05:14 +0100911
Marc Kupietz39887082024-11-22 18:06:20 +0100912DLL_EXPORT COLLOCATORS *get_collocation_scores(COLLOCATORS *db, uint32_t w1,
913 uint32_t w2) {
914 std::vector<Collocator> c = db->get_collocation_scores(w1, w2);
915 if (c.empty())
916 return nullptr;
917 uint64_t size = c.size() + sizeof c[0];
918 auto *p = (COLLOCATORS *)malloc(size);
919 memcpy(p, c.data(), size);
920 return p;
921}
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200922
Marc Kupietz39887082024-11-22 18:06:20 +0100923DLL_EXPORT char *get_word(COLLOCATORS *db, uint32_t w) {
924 return strdup(db->getWord(w).c_str());
925}
Marc Kupietz979580e2024-11-21 18:05:07 +0100926
Marc Kupietz39887082024-11-22 18:06:20 +0100927DLL_EXPORT uint64_t get_word_id(COLLOCATORS *db, char *word) {
928 return db->getWordId(word);
929}
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100930
Marc Kupietz39887082024-11-22 18:06:20 +0100931DLL_EXPORT void read_vocab(COLLOCATORS *db, char *fname) {
932 std::string fName(fname);
933 db->readVocab(fName);
934}
Marc Kupietz88d116b2021-03-13 18:05:14 +0100935
Marc Kupietz39887082024-11-22 18:06:20 +0100936DLL_EXPORT const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
937 return strdup(db->collocators2json(w1, db->get_collocators(w1)).c_str());
938}
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100939
Marc Kupietz39887082024-11-22 18:06:20 +0100940DLL_EXPORT const char *
941get_collocation_scores_as_json(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
942 return strdup(
943 db->collocators2json(w1, db->get_collocation_scores(w1, w2)).c_str());
944}
945
946DLL_EXPORT const char *get_version() { return PROJECT_VERSION; }
Marc Kupietz6208fd72024-11-15 15:46:19 +0100947
Marc Kupietzd26b1052024-12-10 16:56:39 +0100948DLL_EXPORT uint64_t get_corpus_size(COLLOCATORS *db) { return db->getCorpusSize(); };
949
Marc Kupietz21b964c2024-12-10 17:10:50 +0100950DLL_EXPORT uint64_t get_word_frequency(COLLOCATORS *db, uint64_t w1) {
951 return db->getWordFrequency(w1);
952}
953
// Restores the diagnostic state saved by the `#pragma clang diagnostic push`
// at the start of the extern "C" block. The original issued a second push
// here, so the saved state was never restored.
#ifdef __clang__
#pragma clang diagnostic pop
#endif
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100957}