blob: 87d9dbd09db5bd5b44711a9e9902eed958833125 [file] [log] [blame]
// Symbol-visibility helpers. GCC/Clang only accept "default", "hidden",
// "internal" and "protected" as visibility values; "default" is the one that
// exports a symbol from a shared object.
#define EXPORT __attribute__((visibility("default")))
#define IMPORT
Marc Kupietz12af0192021-03-13 18:05:14 +01003
Marc Kupietz39887082024-11-22 18:06:20 +01004#include "config.h"
5#include "export.h"
6#include "merge_operators.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +01007#include "rocksdb/db.h"
8#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +01009#include "rocksdb/table.h"
Marc Kupietze889cec2024-11-23 12:08:42 +010010#include "rocksdb/slice.h"
Marc Kupietz39887082024-11-22 18:06:20 +010011#include <algorithm>
12#include <cassert>
13#include <cmath>
14#include <cstdint>
15#include <iostream>
16#include <memory>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010017#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010018#include <rocksdb/slice_transform.h>
Marc Kupietz39887082024-11-22 18:06:20 +010019#include <sstream> // for ostringstream
20#include <string>
21#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010022
// Number of token positions considered on each side of the node word.
#define WINDOW_SIZE 5
// Minimum co-occurrence count for a pair to be scored.
#define FREQUENCY_THRESHOLD 5
// True when the 16-bit value read from the bytes {0x00, 0xff} is below 0x100,
// i.e. when the first byte is the most significant one.
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)

// Collocation keys are packed into one 64-bit integer:
//   bits  0..23  w1   (node word id)
//   bits 24..    w2   (collocate id, read back as 24 bits by W2)
//   bits 56..63  dist (signed distance, read back as int8_t by DIST)
// Every macro argument is parenthesized so that compound expressions such as
// `a | b` or `x + 1` are evaluated as a unit before shifting/masking.
// NOTE: ids are not masked when encoding; an id above 24 bits spills into
// the neighbouring field.
#define encodeCollocation(w1, w2, dist)                                        \
  (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (w1))
#define W1(key) ((uint64_t)((key) & 0xffffff))
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010031
// C-style vocabulary record: a frequency plus a raw pointer to the word.
// Not referenced elsewhere in this file.
struct vocab_entry {
  uint64_t freq;
  char *word;
};
Marc Kupietzc8ddf452018-01-07 21:33:12 +010036
37// typedef struct Collocator {
38// uint64_t w2;
39// uint64_t sum;
40// };
41
Marc Kupietz28cc53e2017-12-23 17:24:55 +010042using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010043using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010044
Marc Kupietz4b799e92018-01-02 11:04:56 +010045namespace rocksdb {
// One scored collocate of a node word, as filled in by
// CollocatorDB::applyCAMeasures. The member order is part of the contract:
// instances are aggregate-initialized positionally and copied byte-wise to
// C callers, so fields must not be reordered.
class Collocator {
public:
  uint32_t w2;        // id of the collocate
  uint64_t f2;        // corpus frequency of the collocate
  uint64_t raw;       // co-occurrence count summed over all used positions
  double pmi;         // pointwise mutual information
  double npmi;        // normalized PMI
  double llr;         // log-likelihood ratio
  double lfmd;        // log-frequency biased mutual dependency
  double md;          // mutual dependency
  double md_nws;      // mutual dependency for a fixed 2 * WINDOW_SIZE window
  uint64_t left_raw;  // count stored in window slot WINDOW_SIZE
  uint64_t right_raw; // count stored in window slot WINDOW_SIZE - 1
  double left_pmi;    // PMI computed from left_raw with window size 1
  double right_pmi;   // PMI computed from right_raw with window size 1
  double dice;        // Dice coefficient
  double logdice;     // logDice over all used positions
  double ldaf;        // best logDice over all sub-windows of `window`
  int window;         // bitmask of window slots that had counts
  int af_window;      // bitmask of the sub-window that produced `ldaf`
};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +010067
// Incremented by CountMergeOperator::Merge on every invocation.
size_t num_merge_operator_calls;

// Sets the merge-call counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = size_t{0};
}

// Counter for partial merges; nothing in this file increments it.
size_t num_partial_merge_calls;

// Sets the partial-merge counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = size_t{0};
}
75
76inline void EncodeFixed64(char *buf, uint64_t value) {
77 if (!IS_BIG_ENDIAN) {
78 memcpy(buf, &value, sizeof(value));
79 } else {
80 buf[0] = value & 0xff;
81 buf[1] = (value >> 8) & 0xff;
82 buf[2] = (value >> 16) & 0xff;
83 buf[3] = (value >> 24) & 0xff;
84 buf[4] = (value >> 32) & 0xff;
85 buf[5] = (value >> 40) & 0xff;
86 buf[6] = (value >> 48) & 0xff;
87 buf[7] = (value >> 56) & 0xff;
88 }
Marc Kupietz4a5e08a2018-06-05 11:07:11 +020089}
90
Marc Kupietz39887082024-11-22 18:06:20 +010091inline uint32_t DecodeFixed32(const char *ptr) {
92 if (!IS_BIG_ENDIAN) {
93 // Load the raw bytes
94 uint32_t result;
95 memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
96 return result;
97 } else {
98 return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) |
99 (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) |
100 (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) |
101 (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
102 }
103}
104
105inline uint64_t DecodeFixed64(const char *ptr) {
106 if (!IS_BIG_ENDIAN) {
107 // Load the raw bytes
108 uint64_t result;
109 memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
110 return result;
111 } else {
112 uint64_t lo = DecodeFixed32(ptr);
113 uint64_t hi = DecodeFixed32(ptr + 4);
114 return (hi << 32) | lo;
115 }
116}
117
118static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12,
119 uint64_t total, double window_size) {
120 double r1 = f1 * window_size, c1 = f2, e = r1 * c1 / total, o = f12;
121 if (f12 < FREQUENCY_THRESHOLD)
122 return -1.0;
123 else
124 return log2(o / e);
125}
126
127// Bouma, Gerlof (2009): <a
128// href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
129// Normalized (pointwise) mutual information in collocation extraction</a>. In
130// Proceedings of GSCL.
131static double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12,
132 uint64_t total, double window_size) {
133 double r1 = f1 * window_size, c1 = f2, e = r1 * c1 / total, o = f12;
134 if (f12 < FREQUENCY_THRESHOLD)
135 return -1.0;
136 else
137 return log2(o / e) / (-log2(o / total / window_size));
138}
139
// Mutual dependency: log2(observed^2 / expected), where
// expected = f1 * window_size * f2 / total.
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of
// collocation extraction metrics. In: International Conference on Language
// Resources and Evaluation (LREC-2002). (2002) 620-625.
static double ca_md(uint64_t f1, uint64_t f2, uint64_t f12,
                    uint64_t total, double window_size) {
  const double observed = f12;
  const double expected = f1 * window_size * static_cast<double>(f2) / total;
  return std::log2(observed * observed / expected);
}
153
// Log-frequency biased mutual dependency: log2(observed^3 / expected).
// Returns 0 when the pair never co-occurs.
static double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12,
                      uint64_t total, double window_size) {
  if (f12 == 0) {
    return 0;
  }
  const double observed = f12;
  const double expected = f1 * window_size * static_cast<double>(f2) / total;
  return std::log2(observed * observed * observed / expected);
}
161
// Log-likelihood ratio over the 2x2 contingency table of a word pair.
// Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and
// Collocations. PhD dissertation, IMS, University of Stuttgart. Published in
// 2005, URN urn:nbn:de:bsz:93-opus-23714. Free PDF available from
// http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
//
// w1/w2 are the marginal frequencies, w12 the joint frequency, n the corpus
// size. Cells with a non-positive observed count contribute 0.
static double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n,
                    uint64_t window_size) {
  // Row and column marginals.
  const double row1 = (double)w1 * window_size;
  const double row2 = (double)n - row1;
  const double col1 = w2;
  const double col2 = n - col1;

  // Observed cell counts.
  const double obs11 = w12;
  const double obs12 = row1 - obs11;
  const double obs21 = col1 - w12;
  const double obs22 = row2 - obs21;

  // Expected cell counts under independence.
  const double exp11 = row1 * col1 / n;
  const double exp12 = row1 * col2 / n;
  const double exp21 = row2 * col1 / n;
  const double exp22 = row2 * col2 / n;

  const double term11 = obs11 > 0 ? obs11 * std::log(obs11 / exp11) : 0;
  const double term12 = obs12 > 0 ? obs12 * std::log(obs12 / exp12) : 0;
  const double term21 = obs21 > 0 ? obs21 * std::log(obs21 / exp21) : 0;
  const double term22 = obs22 > 0 ? obs22 * std::log(obs22 / exp22) : 0;
  return (2 * (term11 + term12 + term21 + term22));
}
177
// Dice coefficient: 2 * w12 / (w2 + w1 * window_size).
// `n` is accepted for signature parity with the other measures and unused.
static double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n,
                      uint64_t window_size) {
  const double node_total = (double)w1 * window_size;
  const double collocate_total = w2;
  return 2 * w12 / (collocate_total + node_total);
}
183
// logDice: 14 + log2(Dice).
// Rychly, Pavel (2008): A lexicographer-friendly association score. In
// Proceedings of Recent Advances in Slavonic Natural Language Processing,
// RASLAN, 6-9. http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf
// `n` is accepted for signature parity with the other measures and unused.
static double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12,
                         uint64_t n, uint64_t window_size) {
  const double node_total = (double)w1 * window_size;
  const double collocate_total = w2;
  const double dice = 2 * w12 / (collocate_total + node_total);
  return 14 + std::log2(dice);
}
193
194class CountMergeOperator : public AssociativeMergeOperator {
195public:
196 CountMergeOperator() {
197 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
198 }
199
200 bool Merge(const Slice &key, const Slice *existing_value,
201 const Slice &value, std::string *new_value,
202 Logger *logger) const override {
203 assert(new_value->empty());
204 ++num_merge_operator_calls;
205 if (existing_value == nullptr) {
206 new_value->assign(value.data(), value.size());
207 return true;
208 }
209
210 return mergeOperator_->PartialMerge(key, *existing_value, value, new_value,
211 logger);
212 }
213
214 const char *Name() const override { return "UInt64AddOperator"; }
215
216private:
217 std::shared_ptr<MergeOperator> mergeOperator_;
218};
219
220class CollocatorIterator : public Iterator {
221 char prefixc[sizeof(uint64_t)]{};
222 Iterator *base_iterator_;
223
224public:
225 explicit CollocatorIterator(Iterator *base_iterator) : base_iterator_(base_iterator) {}
226
227 void setPrefix(char *prefix) { memcpy(prefixc, prefix, sizeof(uint64_t)); }
228
229 void SeekToFirst() override { base_iterator_->SeekToFirst(); }
230
231 void SeekToLast() override { base_iterator_->SeekToLast(); }
232
233 void Seek(const rocksdb::Slice &s) override { base_iterator_->Seek(s); }
234
235 void SeekForPrev(const rocksdb::Slice &s) override {
236 base_iterator_->SeekForPrev(s);
237 }
238
239 void Prev() override { base_iterator_->Prev(); }
240
241 void Next() override { base_iterator_->Next(); }
242
243 Slice key() const override;
244
245 Slice value() const override;
246
247 Status status() const override;
248
249 bool Valid() const override;
250
251 bool isValid();
252
253 uint64_t intValue();
254
255 uint64_t intKey();
256};
257
258// rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
259
260bool CollocatorIterator::Valid() const {
261 return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
262}
263
264bool CollocatorIterator::isValid() {
265 return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
266 // return key().starts_with(std::string(prefixc,3));
267}
268
269uint64_t CollocatorIterator::intKey() {
270 return DecodeFixed64(base_iterator_->key().data());
271}
272
273uint64_t CollocatorIterator::intValue() {
274 return DecodeFixed64(base_iterator_->value().data());
275}
276
// One vocabulary line: the word and its corpus frequency. Instances are
// brace-initialized positionally as {word, freq}, so the order is fixed.
class VocabEntry {
public:
  std::string word;
  uint64_t freq;
};
282
283class CollocatorDB {
284 WriteOptions merge_option_; // for merge
285 char _one[sizeof(uint64_t)]{};
286 Slice _one_slice;
287 vector<VocabEntry> _vocab;
288 uint64_t total = 0;
289 uint64_t sentences = 0;
290 float avg_window_size = 8.0;
291
292protected:
293 std::shared_ptr<DB> db_;
294
295 WriteOptions put_option_;
296 ReadOptions get_option_;
297 WriteOptions delete_option_;
298
299 uint64_t default_{};
300
301 std::shared_ptr<DB> OpenDb(const char *dbname);
302
303 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
304
305public:
306 virtual ~CollocatorDB() = default;
307 void readVocab(const string& fname);
308 string getWord(uint32_t w1);
309
310 uint64_t getWordId(const char *word) const;
311
312 CollocatorDB(const char *db_name, bool read_only);
313
314 // public interface of CollocatorDB.
315 // All four functions return false
316 // if the underlying level db operation failed.
317
318 // mapped to a levedb Put
319 bool set(const std::string &key, uint64_t value) {
320 // just treat the internal rep of int64 as the string
321 char buf[sizeof(value)];
322 EncodeFixed64(buf, value);
323 Slice slice(buf, sizeof(value));
324 auto s = db_->Put(put_option_, key, slice);
325
326 if (s.ok()) {
327 return true;
328 } else {
329 std::cerr << s.ToString() << std::endl;
330 return false;
331 }
332 }
333
334 DB *getDb() { return db_.get(); }
335
336 // mapped to a rocksdb Delete
337 bool remove(const std::string &key) {
338 auto s = db_->Delete(delete_option_, key);
339
340 if (s.ok()) {
341 return true;
342 } else {
343 std::cerr << s.ToString() << std::endl;
344 return false;
345 }
346 }
347
348 // mapped to a rocksdb Get
349 bool get(const std::string &key, uint64_t *value) {
350 std::string str;
351 auto s = db_->Get(get_option_, key, &str);
352
353 if (s.IsNotFound()) {
354 // return default value if not found;
355 *value = default_;
356 return true;
357 } else if (s.ok()) {
358 // deserialization
359 if (str.size() != sizeof(uint64_t)) {
360 std::cerr << "value corruption\n";
361 return false;
362 }
363 *value = DecodeFixed64(&str[0]);
364 return true;
365 } else {
366 std::cerr << s.ToString() << std::endl;
367 return false;
368 }
369 }
370
371 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
372 char encoded_key[sizeof(uint64_t)];
373 EncodeFixed64(encoded_key, encodeCollocation(w1, w2, dist));
374 uint64_t value = default_;
375 get(std::string(encoded_key, 8), &value);
376 return value;
377 }
378
379 virtual void inc(const std::string &key) {
380 db_->Merge(merge_option_, key, _one_slice);
381 }
382
383 void inc(const uint64_t key) {
384 char encoded_key[sizeof(uint64_t)];
385 EncodeFixed64(encoded_key, key);
386 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
387 }
388
389 virtual void inc(uint32_t w1, uint32_t w2, uint8_t dist);
390
391 void dump(uint32_t w1, uint32_t w2, int8_t dist) const;
392
393 vector<Collocator> get_collocators(uint32_t w1);
394
395 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
396
397 vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
398
399 vector<Collocator> get_collocators(uint32_t w1, uint32_t min_w2,
400 uint32_t max_w2);
401
402 void applyCAMeasures(uint32_t w1, uint32_t w2,
403 uint64_t *sumWindow, uint64_t sum,
404 int usedPositions, int true_window_size,
405 Collocator *result) const;
406
407 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
408
409 string collocators2json(uint32_t w1, const vector<Collocator>& collocators);
410
411 // mapped to a rocksdb Merge operation
412 virtual bool add(const std::string &key, uint64_t value) {
413 char encoded[sizeof(uint64_t)];
414 EncodeFixed64(encoded, value);
415 Slice slice(encoded, sizeof(uint64_t));
416 auto s = db_->Merge(merge_option_, key, slice);
417
418 if (s.ok()) {
419 return true;
420 } else {
421 std::cerr << s.ToString() << std::endl;
422 return false;
423 }
424 }
425
426 CollocatorIterator *SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) const;
427};
428
429CollocatorDB::CollocatorDB(const char *db_name,
430 bool read_only = false) {
431 // merge_option_.sync = true;
432 if (read_only)
433 db_ = OpenDbForRead(strdup(db_name));
434 else
435 db_ = OpenDb(db_name);
436 assert(db_);
437 uint64_t one = 1;
438 EncodeFixed64(_one, one);
439 _one_slice = Slice(_one, sizeof(uint64_t));
440}
441
442void CollocatorDB::inc(const uint32_t w1, const uint32_t w2,
443 const uint8_t dist) {
444 inc(encodeCollocation(w1, w2, dist));
445}
446
447void CollocatorDB::readVocab(const string& fname) {
448 char strbuf[2048];
449 uint64_t freq;
450 FILE *fin = fopen(fname.c_str(), "rb");
451 if (fin == nullptr) {
452 cout << "Vocabulary file " << fname << " not found\n";
453 exit(1);
454 }
455 uint64_t i = 0;
456 while (fscanf(fin, "%s %lu", strbuf, &freq) == 2) {
457 _vocab.push_back({strbuf, freq});
458 total += freq;
459 i++;
460 }
461 fclose(fin);
462
463 char size_fname[256];
464 strcpy(size_fname, fname.c_str());
465 char *pos = strstr(size_fname, ".vocab");
466 if (pos) {
467 *pos = 0;
468 strcat(size_fname, ".size");
469 FILE *fp = fopen(size_fname, "r");
470 if (fp != nullptr) {
471 fscanf(fp, "%lu", &sentences);
472 fscanf(fp, "%lu", &total);
473 float sl = (float)total / (float)sentences;
474 float w = WINDOW_SIZE;
475 avg_window_size =
476 ((sl > 2 * w ? (sl - 2 * w) * 2 * w : 0) + (double)w * (3 * w - 1)) /
477 sl;
478 fprintf(stdout,
479 "Size corrections found: corpus size: %lu tokens in %lu "
480 "sentences, avg. sentence size: %f, avg. window size: %f\n",
481 total, sentences, sl, avg_window_size);
482 fclose(fp);
483 } else {
484 // std::cout << "size file " << size_fname << " not found\n";
485 }
486 } else {
487 std::cout << "cannot determine size file " << size_fname << "\n";
488 }
489}
490
491std::shared_ptr<DB> CollocatorDB::OpenDbForRead(const char *name) {
492 DB *db;
493 Options options;
494 options.env->SetBackgroundThreads(4);
495 options.create_if_missing = true;
496 options.merge_operator = std::make_shared<CountMergeOperator>();
497 options.max_successive_merges = 0;
498 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
499 options.IncreaseParallelism();
500 options.OptimizeLevelStyleCompaction();
501 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
502 ostringstream dbname, vocabname;
503 dbname << name << ".rocksdb";
504 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
505 if (!s.ok()) {
506 std::cerr << s.ToString() << std::endl;
507 assert(false);
508 }
509 vocabname << name << ".vocab";
510 readVocab(vocabname.str());
511 return std::shared_ptr<DB>(db);
512}
513
514std::shared_ptr<DB> CollocatorDB::OpenDb(const char *dbname) {
515 DB *db;
516 Options options;
517
518 options.env->SetBackgroundThreads(4);
519 options.create_if_missing = true;
520 options.merge_operator = std::make_shared<CountMergeOperator>();
521 options.max_successive_merges = 0;
522 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
523 options.IncreaseParallelism();
524 options.OptimizeLevelStyleCompaction();
525 // options.max_write_buffer_number = 48;
526 // options.max_background_jobs = 48;
527 // options.allow_concurrent_memtable_write=true;
528 // options.memtable_factory.reset(NewHashLinkListRepFactory(200000));
529 // options.enable_write_thread_adaptive_yield = 1;
530 // options.allow_concurrent_memtable_write = 1;
531 // options.memtable_factory.reset(new SkipListFactory);
532 // options.write_buffer_size = 1 << 22;
533 // options.allow_mmap_reads = true;
534 // options.allow_mmap_writes = true;
535 // options.max_background_compactions = 40;
536 // BlockBasedTableOptions table_options;
537 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
538 // options.bloom_locality = 1;
539 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
540 // table_options.block_cache = cache;
541 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
542 Status s;
543 // DestroyDB(dbname, Options());
544 s = DB::Open(options, dbname, &db);
545 if (!s.ok()) {
546 std::cerr << s.ToString() << std::endl;
547 assert(false);
548 }
549 total = 1000;
550 return std::shared_ptr<DB>(db);
551}
552
553CollocatorIterator *
554CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) const {
555 ReadOptions options;
556 options.prefix_same_as_start = true;
557 char prefixc[sizeof(uint64_t)];
558 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
559 Iterator *it = db_->NewIterator(options);
560 auto *cit = new CollocatorIterator(it);
561 if (w2 > 0)
562 cit->Seek(std::string(prefixc, 6));
563 else
564 cit->Seek(std::string(prefixc, 3));
565 cit->setPrefix(prefixc);
566 return cit;
567}
568
569void CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) const {
570 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
571 for (; it->isValid(); it->Next()) {
572 uint64_t value = it->intValue();
573 uint64_t key = it->intKey();
574 std::cout << "w1:" << W1(key) << ", w2:" << W2(key)
575 << ", dist:" << (int32_t)DIST(key) << " - count:" << value
576 << std::endl;
577 }
578 std::cout << "ready dumping\n";
579}
580
581bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) {
582 return lhs.npmi > rhs.npmi;
583}
584
585bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) {
586 return lhs.lfmd > rhs.lfmd;
587}
588
589bool sortByLlr(const Collocator &lhs, const Collocator &rhs) {
590 return lhs.llr > rhs.llr;
591}
592
593bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) {
594 return lhs.logdice > rhs.logdice;
595}
596
597bool sortByLogDiceAF(const Collocator &lhs, const Collocator &rhs) {
598 return lhs.ldaf > rhs.ldaf;
599}
600
601void CollocatorDB::applyCAMeasures(
602 const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
603 const uint64_t sum, const int usedPositions, int true_window_size,
604 Collocator *result) const {
605 uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
606 double o = sum, r1 = f1 * true_window_size, c1 = f2, e = r1 * c1 / total,
607 pmi = log2(o / e), md = log2(o * o / e), lfmd = log2(o * o * o / e),
Marc Kupietze889cec2024-11-23 12:08:42 +0100608 llr = ca_ll(f1, f2, sum, total, true_window_size),
609 md_nws = ca_md(f1, f2, sum, total, 2 * WINDOW_SIZE),
610 ld = ca_logdice(f1, f2, sum, total, true_window_size);
Marc Kupietz39887082024-11-22 18:06:20 +0100611
612 int bestWindow = usedPositions;
613 double bestAF = ld;
614 // if(f1<75000000)
615 // #pragma omp parallel for reduction(max:bestAF)
616 // #pragma omp target teams distribute parallel for reduction(max:bestAF)
617 // map(tofrom:bestAF,currentAF,bestWindow,usedPositions)
618 for (int bitmask = 1; bitmask < (1 << (2 * WINDOW_SIZE)); bitmask++) {
619 if ((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0)
620 continue;
621 uint64_t currentWindowSum = 0;
622 // #pragma omp target teams distribute parallel for
623 // reduction(+:currentWindowSum) map(tofrom:bitmask,usedPositions)
624 for (int pos = 0; pos < 2 * WINDOW_SIZE; pos++) {
625 if (((1 << pos) & bitmask & usedPositions) != 0)
626 currentWindowSum += sumWindow[pos];
627 }
628 double currentAF = ca_logdice(f1, f2, currentWindowSum, total,
629 __builtin_popcount(bitmask));
630 if (currentAF > bestAF) {
631 bestAF = currentAF;
632 bestWindow = bitmask;
633 }
634 }
635
636 *result = {w2,
637 f2,
638 sum,
639 pmi,
640 pmi / (-log2(o / total / true_window_size)),
641 llr,
642 lfmd,
643 md,
Marc Kupietze889cec2024-11-23 12:08:42 +0100644 md_nws,
Marc Kupietz39887082024-11-22 18:06:20 +0100645 sumWindow[WINDOW_SIZE],
646 sumWindow[WINDOW_SIZE - 1],
647 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1),
648 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE - 1], total, 1),
649 ca_dice(f1, f2, sum, total, true_window_size),
650 ld,
651 bestAF,
652 usedPositions,
653 bestWindow};
654}
655
656std::vector<Collocator>
657CollocatorDB::get_collocators(uint32_t w1, uint32_t min_w2,
658 uint32_t max_w2) {
659 std::vector<Collocator> collocators;
660 uint64_t w2, last_w2 = 0xffffffffffffffff;
661 uint64_t maxv = 0, sum = 0;
662 auto *sumWindow =
663 static_cast<uint64_t *>(malloc(sizeof(uint64_t) * 2 * WINDOW_SIZE));
664 memset(sumWindow, 0, sizeof(uint64_t) * 2 * WINDOW_SIZE);
665 int true_window_size = 1;
666 int usedPositions = 0;
667
668 if (w1 > _vocab.size()) {
669 std::cout << w1 << "> vocabulary size " << _vocab.size() << "\n";
670 w1 -= _vocab.size();
671 }
672#ifdef DEBUG
673 std::cout << "Searching for collocates of " << _vocab[w1].word << "\n";
674#endif
675 // #pragma omp parallel num_threads(40)
676 // #pragma omp single
677 for (auto it =
678 std::unique_ptr<CollocatorIterator>(SeekIterator(w1, min_w2, 0));
679 it->isValid(); it->Next()) {
680 uint64_t value = it->intValue(), key = it->intKey();
681 if ((w2 = W2(key)) > max_w2)
682 continue;
683 if (last_w2 == 0xffffffffffffffff)
684 last_w2 = w2;
685 if (w2 != last_w2) {
686 if (sum >= FREQUENCY_THRESHOLD) {
687 collocators.push_back({});
688 Collocator *result = &(collocators[collocators.size() - 1]);
689 // #pragma omp task firstprivate(last_w2, sumWindow, sum, usedPositions,
690 // true_window_size) shared(w1, result) if(sum > 1000000)
691 {
692 // uint64_t *nsw = (uint64_t *)malloc(sizeof(uint64_t) * 2
693 // *WINDOW_SIZE); memcpy(nsw, sumWindow, sizeof(uint64_t) * 2
694 // *WINDOW_SIZE);
695 applyCAMeasures(w1, last_w2, sumWindow, sum, usedPositions,
696 true_window_size, result);
697 // free(nsw);
698 }
699 }
700 memset(sumWindow, 0, 2 * WINDOW_SIZE * sizeof(uint64_t));
701 usedPositions = 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
702 sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
703 last_w2 = w2;
704 maxv = value;
705 sum = value;
706 true_window_size = 1;
707 if (min_w2 == max_w2 && w2 != min_w2)
708 break;
709 } else {
710 sum += value;
711 if (value > maxv)
712 maxv = value;
713 usedPositions |=
714 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
715 sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
716 true_window_size++;
717 }
718 }
719
720 // #pragma omp taskwait
721 sort(collocators.begin(), collocators.end(), sortByLogDiceAF);
722
723#ifdef DEBUG
724 int i = 0;
725 for (Collocator c : collocators) {
726 if (i++ > 10)
727 break;
728 std::cout << "w1:" << _vocab[w1].word << ", w2: *" << _vocab[c.w2].word
729 << "*"
730 << "\t f(w1):" << _vocab[w1].freq
731 << "\t f(w2):" << _vocab[c.w2].freq << "\t f(w1, w2):" << c.raw
732 << "\t pmi:" << c.pmi << "\t npmi:" << c.npmi
733 << "\t llr:" << c.llr << "\t md:" << c.md << "\t lfmd:" << c.lfmd
734 << "\t total:" << total << std::endl;
735 }
736#endif
737
738 return collocators;
739}
740
741std::vector<Collocator>
742CollocatorDB::get_collocation_scores(uint32_t w1, uint32_t w2) {
743 return get_collocators(w1, w2, w2);
744}
745
746std::vector<Collocator> CollocatorDB::get_collocators(uint32_t w1) {
747 return get_collocators(w1, 0, UINT32_MAX);
748}
749
750void CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
751 std::vector<Collocator> collocators;
752 std::stringstream stream;
753 uint64_t w2, last_w2 = 0xffffffffffffffff;
754 uint64_t maxv = 0, total_w1 = 0;
755 bool first = true;
756 for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0));
757 it->isValid(); it->Next()) {
758 uint64_t value = it->intValue(), key = it->intKey();
759 w2 = W2(key);
760 total_w1 += value;
761 if (last_w2 == 0xffffffffffffffff)
762 last_w2 = w2;
763 if (w2 != last_w2) {
764 if (maxv >= min_cooccur) {
765 double llr =
766 ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
767 if (first)
768 first = false;
769 else
770 stream << " ";
771 stream << w2 << " " << llr;
772 }
773 last_w2 = w2;
774 maxv = value;
775 } else {
776 if (value > maxv)
777 maxv = value;
778 }
779 }
780 if (first)
781 stream << "1 0.0";
782 stream << "\n";
783 std::cout << stream.str();
784}
785
786Slice CollocatorIterator::key() const {
787 return base_iterator_->key();
788}
789
790Slice CollocatorIterator::value() const {
791 return base_iterator_->value();
792}
793
794Status CollocatorIterator::status() const {
795 return base_iterator_->status();
796}
797
798}; // namespace rocksdb
799
800string CollocatorDB::getWord(uint32_t w1) { return _vocab[w1].word; }
801
802uint64_t CollocatorDB::getWordId(const char *word) const {
Marc Kupietz979580e2024-11-21 18:05:07 +0100803 for (uint64_t i = 0; i < _vocab.size(); i++) {
804 if (strcmp(_vocab[i].word.c_str(), word) == 0)
805 return i;
806 }
807 return 0;
808}
809
Marc Kupietz39887082024-11-22 18:06:20 +0100810string CollocatorDB::collocators2json(uint32_t w1,
811 const vector<Collocator>& collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100812 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100813 int i = 0;
Marc Kupietz39887082024-11-22 18:06:20 +0100814 s << " { \"f1\": " << _vocab[w1].freq << "," << R"("w1":")"
815 << string(_vocab[w1].word) << "\", " << "\"N\": " << total << ", "
816 << "\"collocates\": [";
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100817 bool first = true;
818 for (Collocator c : collocators) {
Marc Kupietz39887082024-11-22 18:06:20 +0100819 if (strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0)
820 continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100821 if (i++ > 200)
822 break;
Marc Kupietz12af0192021-03-13 18:05:14 +0100823 if (!first)
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100824 s << ",\n";
825 else
826 first = false;
827 s << "{"
Marc Kupietz39887082024-11-22 18:06:20 +0100828 "\"word\":\""
829 << (string(_vocab[c.w2].word) == "<num>"
830 ? string("###")
831 : string(_vocab[c.w2].word))
832 << "\"," << "\"f2\":" << c.f2 << "," << "\"f\":" << c.raw << ","
833 << "\"npmi\":" << c.npmi << "," << "\"pmi\":" << c.pmi << ","
834 << "\"llr\":" << c.llr << "," << "\"lfmd\":" << c.lfmd << ","
Marc Kupietze889cec2024-11-23 12:08:42 +0100835 << "\"md\":" << c.md << "," << "\"md_nws\":" << c.md_nws << "," << "\"dice\":" << c.dice << ","
Marc Kupietz39887082024-11-22 18:06:20 +0100836 << "\"ld\":" << c.logdice << "," << "\"ln_count\":" << c.left_raw << ","
837 << "\"rn_count\":" << c.right_raw << "," << "\"ln_pmi\":" << c.left_pmi
838 << "," << "\"rn_pmi\":" << c.right_pmi << "," << "\"ldaf\":" << c.ldaf
839 << "," << "\"win\":" << c.window << "," << "\"afwin\":" << c.af_window
840 << "}";
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100841 }
Marc Kupietze9627152019-02-04 12:32:12 +0100842 s << "]}\n";
Marc Kupietz0421d092021-03-13 18:05:14 +0100843 // std::cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100844 return s.str();
845}
846
Marc Kupietz39887082024-11-22 18:06:20 +0100847typedef CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100848
849extern "C" {
Marc Kupietz12af0192021-03-13 18:05:14 +0100850#ifdef __clang__
851#pragma clang diagnostic push
852#pragma ide diagnostic ignored "OCUnusedGlobalDeclarationInspection"
853#endif
Marc Kupietz39887082024-11-22 18:06:20 +0100854DLL_EXPORT COLLOCATORS *open_collocatordb_for_write(char *dbname) {
855 return new CollocatorDB(dbname, false);
856}
Marc Kupietz12af0192021-03-13 18:05:14 +0100857
Marc Kupietz39887082024-11-22 18:06:20 +0100858DLL_EXPORT COLLOCATORS *open_collocatordb(char *dbname) {
859 return new CollocatorDB(dbname, true);
860}
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100861
Marc Kupietz39887082024-11-22 18:06:20 +0100862DLL_EXPORT void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2,
863 int8_t dist) {
864 db->inc(w1, w2, dist);
865}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100866
Marc Kupietz39887082024-11-22 18:06:20 +0100867DLL_EXPORT void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2,
868 int8_t dist) {
869 db->dump(w1, w2, dist);
870}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100871
Marc Kupietz39887082024-11-22 18:06:20 +0100872DLL_EXPORT COLLOCATORS *get_collocators(COLLOCATORS *db, uint32_t w1) {
873 std::vector<Collocator> c = db->get_collocators(w1);
874 if (c.empty())
875 return nullptr;
876 uint64_t size = c.size() + sizeof c[0];
877 auto *p = (COLLOCATORS *)malloc(size);
878 memcpy(p, c.data(), size);
879 return p;
880}
Marc Kupietz88d116b2021-03-13 18:05:14 +0100881
Marc Kupietz39887082024-11-22 18:06:20 +0100882DLL_EXPORT COLLOCATORS *get_collocation_scores(COLLOCATORS *db, uint32_t w1,
883 uint32_t w2) {
884 std::vector<Collocator> c = db->get_collocation_scores(w1, w2);
885 if (c.empty())
886 return nullptr;
887 uint64_t size = c.size() + sizeof c[0];
888 auto *p = (COLLOCATORS *)malloc(size);
889 memcpy(p, c.data(), size);
890 return p;
891}
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200892
Marc Kupietz39887082024-11-22 18:06:20 +0100893DLL_EXPORT char *get_word(COLLOCATORS *db, uint32_t w) {
894 return strdup(db->getWord(w).c_str());
895}
Marc Kupietz979580e2024-11-21 18:05:07 +0100896
Marc Kupietz39887082024-11-22 18:06:20 +0100897DLL_EXPORT uint64_t get_word_id(COLLOCATORS *db, char *word) {
898 return db->getWordId(word);
899}
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100900
Marc Kupietz39887082024-11-22 18:06:20 +0100901DLL_EXPORT void read_vocab(COLLOCATORS *db, char *fname) {
902 std::string fName(fname);
903 db->readVocab(fName);
904}
Marc Kupietz88d116b2021-03-13 18:05:14 +0100905
Marc Kupietz39887082024-11-22 18:06:20 +0100906DLL_EXPORT const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
907 return strdup(db->collocators2json(w1, db->get_collocators(w1)).c_str());
908}
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100909
Marc Kupietz39887082024-11-22 18:06:20 +0100910DLL_EXPORT const char *
911get_collocation_scores_as_json(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
912 return strdup(
913 db->collocators2json(w1, db->get_collocation_scores(w1, w2)).c_str());
914}
915
916DLL_EXPORT const char *get_version() { return PROJECT_VERSION; }
Marc Kupietz6208fd72024-11-15 15:46:19 +0100917
// Restores the diagnostic state saved by the matching
// `#pragma clang diagnostic push` at the top of the extern "C" block
// (this line used to push a second time instead of popping).
#ifdef __clang__
#pragma clang diagnostic pop
#endif
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100921}