blob: 41d0ad046c3f56739738856abab830d09795464f [file] [log] [blame]
// Symbol visibility helpers. GCC and Clang accept only "default", "hidden",
// "internal" and "protected" as visibility values, so an exported symbol has to
// use "default".
#define EXPORT __attribute__((visibility("default")))
#define IMPORT
Marc Kupietz12af0192021-03-13 18:05:14 +01003
#include "config.h"
#include "export.h"
#include "merge_operators.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include <rocksdb/merge_operator.h>
#include <rocksdb/slice_transform.h>
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <sstream> // for ostringstream
#include <string>
#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010021
// Half-width of the co-occurrence window: 2 * WINDOW_SIZE distance slots are
// tracked for every word pair.
#define WINDOW_SIZE 5
// Minimum co-occurrence count a pair needs before it is scored.
#define FREQUENCY_THRESHOLD 5
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
// Packs a collocation into one 64-bit key: w1 in bits 0-23, w2 starting at
// bit 24 and the distance in the top byte. Every macro argument is
// parenthesized so that compound expressions expand as intended.
#define encodeCollocation(w1, w2, dist)                                        \
  (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (uint64_t)(w1))
// Field extractors for keys built by encodeCollocation.
#define W1(key) (uint64_t)((key) & 0xffffff)
#define W2(key) (uint64_t)(((key) >> 24) & 0xffffff)
#define DIST(key) (int8_t)((uint64_t)(((key) >> 56) & 0xff))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010030
// C-style vocabulary record: a frequency paired with a pointer to the word's
// characters.
struct vocab_entry {
  uint64_t freq;
  char *word;
};
Marc Kupietzc8ddf452018-01-07 21:33:12 +010035
36// typedef struct Collocator {
37// uint64_t w2;
38// uint64_t sum;
39// };
40
Marc Kupietz28cc53e2017-12-23 17:24:55 +010041using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010042using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010043
Marc Kupietz4b799e92018-01-02 11:04:56 +010044namespace rocksdb {
// Association scores of one collocate, as filled in by
// CollocatorDB::applyCAMeasures. The member order is part of the contract:
// instances are brace-initialized positionally and copied out bytewise through
// the C interface.
struct Collocator {
  uint32_t w2;        // vocabulary id of the collocate
  uint64_t f2;        // vocabulary frequency of the collocate
  uint64_t raw;       // co-occurrence count summed over all window slots
  double pmi;         // pointwise mutual information
  double npmi;        // normalized pmi
  double llr;         // log-likelihood ratio
  double lfmd;        // log2(o^3 / e)
  double md;          // log2(o^2 / e)
  uint64_t left_raw;  // count in window slot WINDOW_SIZE (distance -1)
  uint64_t right_raw; // count in window slot WINDOW_SIZE - 1 (distance +1)
  double left_pmi;    // pmi computed from left_raw alone
  double right_pmi;   // pmi computed from right_raw alone
  double dice;        // Dice coefficient
  double logdice;     // logDice over the whole window
  double ldaf;        // best logDice found over subsets of the used slots
  int window;         // bitmask of the window slots that carry counts
  int af_window;      // bitmask of the slot subset that yielded ldaf
};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +010065
// Number of CountMergeOperator::Merge invocations since the last reset.
size_t num_merge_operator_calls = 0;

// Sets the merge call counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = 0;
}
69
// Counter for partial merge calls; nothing in this file increments it.
size_t num_partial_merge_calls = 0;

// Sets the partial merge call counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = 0;
}
73
// Writes `value` into the 8 bytes at `buf`, least-significant byte first.
//
// The shifts give the same layout on every host, so no endianness probe (and
// no memcpy) is needed; compilers turn the loop into a plain store on
// little-endian targets.
inline void EncodeFixed64(char *buf, uint64_t value) {
  for (int i = 0; i < 8; ++i) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}
88
// Reads a 32-bit value stored least-significant byte first at `ptr`.
//
// Assembling the value from single bytes is independent of host endianness and
// of the alignment of `ptr`.
inline uint32_t DecodeFixed32(const char *ptr) {
  uint32_t result = 0;
  for (int i = 0; i < 4; ++i) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i]))
              << (8 * i);
  }
  return result;
}
102
// Reads a 64-bit value stored least-significant byte first at `ptr`.
//
// Assembling the value from single bytes is independent of host endianness and
// of the alignment of `ptr`.
inline uint64_t DecodeFixed64(const char *ptr) {
  uint64_t result = 0;
  for (int i = 0; i < 8; ++i) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i]))
              << (8 * i);
  }
  return result;
}
115
116static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12,
117 uint64_t total, double window_size) {
118 double r1 = f1 * window_size, c1 = f2, e = r1 * c1 / total, o = f12;
119 if (f12 < FREQUENCY_THRESHOLD)
120 return -1.0;
121 else
122 return log2(o / e);
123}
124
125// Bouma, Gerlof (2009): <a
126// href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
127// Normalized (pointwise) mutual information in collocation extraction</a>. In
128// Proceedings of GSCL.
129static double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12,
130 uint64_t total, double window_size) {
131 double r1 = f1 * window_size, c1 = f2, e = r1 * c1 / total, o = f12;
132 if (f12 < FREQUENCY_THRESHOLD)
133 return -1.0;
134 else
135 return log2(o / e) / (-log2(o / total / window_size));
136}
137
// Mutual dependency, log2(o^2 / e), with o = f12 and
// e = f1 * window_size * f2 / total.
//
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of
// collocation extraction metrics. In: International Conference on Language
// Resources and Evaluation (LREC-2002). (2002) 620–625
//
// Earlier inline formulation, kept for reference:
//   double md = log2(pow((double)max * window_size / total, 2) /
//       (window_size * ((double)_vocab[w1].freq/total) *
//        ((double)_vocab[last_w2].freq/total)));
static double ca_md(uint64_t f1, uint64_t f2, uint64_t f12,
                    uint64_t total, double window_size) {
  const double observed = f12;
  const double row_total = f1 * window_size;
  const double column_total = f2;
  const double expected = row_total * column_total / total;
  return log2(observed * observed / expected);
}
151
// Log-frequency biased mutual dependency, log2(o^3 / e), with o = f12 and
// e = f1 * window_size * f2 / total. A pair that never co-occurs scores 0.
static double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12,
                      uint64_t total, double window_size) {
  if (f12 == 0) {
    return 0;
  }
  const double observed = f12;
  const double row_total = f1 * window_size;
  const double column_total = f2;
  const double expected = row_total * column_total / total;
  return log2(observed * observed * observed / expected);
}
159
// Log-likelihood ratio over the 2x2 contingency table of the pair: row totals
// are w1 * window_size and n minus that, column totals are w2 and n - w2.
// Cells whose observed count is not positive contribute nothing.
//
// Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and
// Collocations. PhD dissertation, IMS, University of Stuttgart. Published in
// 2005, URN urn:nbn:de:bsz:93-opus-23714. Free PDF available from
// http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
static double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n,
                    uint64_t window_size) {
  // Marginals.
  const double r1 = (double)w1 * window_size;
  const double r2 = (double)n - r1;
  const double c1 = w2;
  const double c2 = n - c1;

  // Observed cells.
  const double o11 = w12;
  const double o12 = r1 - o11;
  const double o21 = c1 - w12;
  const double o22 = r2 - o21;

  // Expected cells under independence.
  const double e11 = r1 * c1 / n;
  const double e12 = r1 * c2 / n;
  const double e21 = r2 * c1 / n;
  const double e22 = r2 * c2 / n;

  const auto cell = [](double observed, double expected) -> double {
    if (observed > 0) {
      return observed * log(observed / expected);
    }
    return 0;
  };

  return 2 * (cell(o11, e11) + cell(o12, e12) + cell(o21, e21) +
              cell(o22, e22));
}
175
// Dice coefficient 2 * w12 / (w2 + w1 * window_size). The corpus size n is
// accepted for signature parity with the other measures but is not used.
static double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n,
                      uint64_t window_size) {
  const double column_total = w2;
  const double row_total = (double)w1 * window_size;
  const uint64_t doubled_hits = 2 * w12;
  return doubled_hits / (column_total + row_total);
}
181
// logDice: 14 + log2 of the Dice coefficient 2 * w12 / (w2 + w1 * window_size).
// The corpus size n is accepted for signature parity but is not used.
//
// Rychlý, Pavel (2008): <a
// href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A
// lexicographer-friendly association score.</a> In Proceedings of Recent
// Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
static double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12,
                         uint64_t n, uint64_t window_size) {
  const double column_total = w2;
  const double row_total = (double)w1 * window_size;
  const uint64_t doubled_hits = 2 * w12;
  const double dice = doubled_hits / (column_total + row_total);
  return 14 + log2(dice);
}
191
192class CountMergeOperator : public AssociativeMergeOperator {
193public:
194 CountMergeOperator() {
195 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
196 }
197
198 bool Merge(const Slice &key, const Slice *existing_value,
199 const Slice &value, std::string *new_value,
200 Logger *logger) const override {
201 assert(new_value->empty());
202 ++num_merge_operator_calls;
203 if (existing_value == nullptr) {
204 new_value->assign(value.data(), value.size());
205 return true;
206 }
207
208 return mergeOperator_->PartialMerge(key, *existing_value, value, new_value,
209 logger);
210 }
211
212 const char *Name() const override { return "UInt64AddOperator"; }
213
214private:
215 std::shared_ptr<MergeOperator> mergeOperator_;
216};
217
218class CollocatorIterator : public Iterator {
219 char prefixc[sizeof(uint64_t)]{};
220 Iterator *base_iterator_;
221
222public:
223 explicit CollocatorIterator(Iterator *base_iterator) : base_iterator_(base_iterator) {}
224
225 void setPrefix(char *prefix) { memcpy(prefixc, prefix, sizeof(uint64_t)); }
226
227 void SeekToFirst() override { base_iterator_->SeekToFirst(); }
228
229 void SeekToLast() override { base_iterator_->SeekToLast(); }
230
231 void Seek(const rocksdb::Slice &s) override { base_iterator_->Seek(s); }
232
233 void SeekForPrev(const rocksdb::Slice &s) override {
234 base_iterator_->SeekForPrev(s);
235 }
236
237 void Prev() override { base_iterator_->Prev(); }
238
239 void Next() override { base_iterator_->Next(); }
240
241 Slice key() const override;
242
243 Slice value() const override;
244
245 Status status() const override;
246
247 bool Valid() const override;
248
249 bool isValid();
250
251 uint64_t intValue();
252
253 uint64_t intKey();
254};
255
256// rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
257
258bool CollocatorIterator::Valid() const {
259 return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
260}
261
262bool CollocatorIterator::isValid() {
263 return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
264 // return key().starts_with(std::string(prefixc,3));
265}
266
267uint64_t CollocatorIterator::intKey() {
268 return DecodeFixed64(base_iterator_->key().data());
269}
270
271uint64_t CollocatorIterator::intValue() {
272 return DecodeFixed64(base_iterator_->value().data());
273}
274
// One vocabulary line: the word and its frequency. The member order matters
// because entries are brace-initialized positionally.
struct VocabEntry {
  std::string word;
  uint64_t freq;
};
280
281class CollocatorDB {
282 WriteOptions merge_option_; // for merge
283 char _one[sizeof(uint64_t)]{};
284 Slice _one_slice;
285 vector<VocabEntry> _vocab;
286 uint64_t total = 0;
287 uint64_t sentences = 0;
288 float avg_window_size = 8.0;
289
290protected:
291 std::shared_ptr<DB> db_;
292
293 WriteOptions put_option_;
294 ReadOptions get_option_;
295 WriteOptions delete_option_;
296
297 uint64_t default_{};
298
299 std::shared_ptr<DB> OpenDb(const char *dbname);
300
301 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
302
303public:
304 virtual ~CollocatorDB() = default;
305 void readVocab(const string& fname);
306 string getWord(uint32_t w1);
307
308 uint64_t getWordId(const char *word) const;
309
310 CollocatorDB(const char *db_name, bool read_only);
311
312 // public interface of CollocatorDB.
313 // All four functions return false
314 // if the underlying level db operation failed.
315
316 // mapped to a levedb Put
317 bool set(const std::string &key, uint64_t value) {
318 // just treat the internal rep of int64 as the string
319 char buf[sizeof(value)];
320 EncodeFixed64(buf, value);
321 Slice slice(buf, sizeof(value));
322 auto s = db_->Put(put_option_, key, slice);
323
324 if (s.ok()) {
325 return true;
326 } else {
327 std::cerr << s.ToString() << std::endl;
328 return false;
329 }
330 }
331
332 DB *getDb() { return db_.get(); }
333
334 // mapped to a rocksdb Delete
335 bool remove(const std::string &key) {
336 auto s = db_->Delete(delete_option_, key);
337
338 if (s.ok()) {
339 return true;
340 } else {
341 std::cerr << s.ToString() << std::endl;
342 return false;
343 }
344 }
345
346 // mapped to a rocksdb Get
347 bool get(const std::string &key, uint64_t *value) {
348 std::string str;
349 auto s = db_->Get(get_option_, key, &str);
350
351 if (s.IsNotFound()) {
352 // return default value if not found;
353 *value = default_;
354 return true;
355 } else if (s.ok()) {
356 // deserialization
357 if (str.size() != sizeof(uint64_t)) {
358 std::cerr << "value corruption\n";
359 return false;
360 }
361 *value = DecodeFixed64(&str[0]);
362 return true;
363 } else {
364 std::cerr << s.ToString() << std::endl;
365 return false;
366 }
367 }
368
369 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
370 char encoded_key[sizeof(uint64_t)];
371 EncodeFixed64(encoded_key, encodeCollocation(w1, w2, dist));
372 uint64_t value = default_;
373 get(std::string(encoded_key, 8), &value);
374 return value;
375 }
376
377 virtual void inc(const std::string &key) {
378 db_->Merge(merge_option_, key, _one_slice);
379 }
380
381 void inc(const uint64_t key) {
382 char encoded_key[sizeof(uint64_t)];
383 EncodeFixed64(encoded_key, key);
384 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
385 }
386
387 virtual void inc(uint32_t w1, uint32_t w2, uint8_t dist);
388
389 void dump(uint32_t w1, uint32_t w2, int8_t dist) const;
390
391 vector<Collocator> get_collocators(uint32_t w1);
392
393 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
394
395 vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
396
397 vector<Collocator> get_collocators(uint32_t w1, uint32_t min_w2,
398 uint32_t max_w2);
399
400 void applyCAMeasures(uint32_t w1, uint32_t w2,
401 uint64_t *sumWindow, uint64_t sum,
402 int usedPositions, int true_window_size,
403 Collocator *result) const;
404
405 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
406
407 string collocators2json(uint32_t w1, const vector<Collocator>& collocators);
408
409 // mapped to a rocksdb Merge operation
410 virtual bool add(const std::string &key, uint64_t value) {
411 char encoded[sizeof(uint64_t)];
412 EncodeFixed64(encoded, value);
413 Slice slice(encoded, sizeof(uint64_t));
414 auto s = db_->Merge(merge_option_, key, slice);
415
416 if (s.ok()) {
417 return true;
418 } else {
419 std::cerr << s.ToString() << std::endl;
420 return false;
421 }
422 }
423
424 CollocatorIterator *SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) const;
425};
426
427CollocatorDB::CollocatorDB(const char *db_name,
428 bool read_only = false) {
429 // merge_option_.sync = true;
430 if (read_only)
431 db_ = OpenDbForRead(strdup(db_name));
432 else
433 db_ = OpenDb(db_name);
434 assert(db_);
435 uint64_t one = 1;
436 EncodeFixed64(_one, one);
437 _one_slice = Slice(_one, sizeof(uint64_t));
438}
439
440void CollocatorDB::inc(const uint32_t w1, const uint32_t w2,
441 const uint8_t dist) {
442 inc(encodeCollocation(w1, w2, dist));
443}
444
445void CollocatorDB::readVocab(const string& fname) {
446 char strbuf[2048];
447 uint64_t freq;
448 FILE *fin = fopen(fname.c_str(), "rb");
449 if (fin == nullptr) {
450 cout << "Vocabulary file " << fname << " not found\n";
451 exit(1);
452 }
453 uint64_t i = 0;
454 while (fscanf(fin, "%s %lu", strbuf, &freq) == 2) {
455 _vocab.push_back({strbuf, freq});
456 total += freq;
457 i++;
458 }
459 fclose(fin);
460
461 char size_fname[256];
462 strcpy(size_fname, fname.c_str());
463 char *pos = strstr(size_fname, ".vocab");
464 if (pos) {
465 *pos = 0;
466 strcat(size_fname, ".size");
467 FILE *fp = fopen(size_fname, "r");
468 if (fp != nullptr) {
469 fscanf(fp, "%lu", &sentences);
470 fscanf(fp, "%lu", &total);
471 float sl = (float)total / (float)sentences;
472 float w = WINDOW_SIZE;
473 avg_window_size =
474 ((sl > 2 * w ? (sl - 2 * w) * 2 * w : 0) + (double)w * (3 * w - 1)) /
475 sl;
476 fprintf(stdout,
477 "Size corrections found: corpus size: %lu tokens in %lu "
478 "sentences, avg. sentence size: %f, avg. window size: %f\n",
479 total, sentences, sl, avg_window_size);
480 fclose(fp);
481 } else {
482 // std::cout << "size file " << size_fname << " not found\n";
483 }
484 } else {
485 std::cout << "cannot determine size file " << size_fname << "\n";
486 }
487}
488
489std::shared_ptr<DB> CollocatorDB::OpenDbForRead(const char *name) {
490 DB *db;
491 Options options;
492 options.env->SetBackgroundThreads(4);
493 options.create_if_missing = true;
494 options.merge_operator = std::make_shared<CountMergeOperator>();
495 options.max_successive_merges = 0;
496 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
497 options.IncreaseParallelism();
498 options.OptimizeLevelStyleCompaction();
499 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
500 ostringstream dbname, vocabname;
501 dbname << name << ".rocksdb";
502 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
503 if (!s.ok()) {
504 std::cerr << s.ToString() << std::endl;
505 assert(false);
506 }
507 vocabname << name << ".vocab";
508 readVocab(vocabname.str());
509 return std::shared_ptr<DB>(db);
510}
511
512std::shared_ptr<DB> CollocatorDB::OpenDb(const char *dbname) {
513 DB *db;
514 Options options;
515
516 options.env->SetBackgroundThreads(4);
517 options.create_if_missing = true;
518 options.merge_operator = std::make_shared<CountMergeOperator>();
519 options.max_successive_merges = 0;
520 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
521 options.IncreaseParallelism();
522 options.OptimizeLevelStyleCompaction();
523 // options.max_write_buffer_number = 48;
524 // options.max_background_jobs = 48;
525 // options.allow_concurrent_memtable_write=true;
526 // options.memtable_factory.reset(NewHashLinkListRepFactory(200000));
527 // options.enable_write_thread_adaptive_yield = 1;
528 // options.allow_concurrent_memtable_write = 1;
529 // options.memtable_factory.reset(new SkipListFactory);
530 // options.write_buffer_size = 1 << 22;
531 // options.allow_mmap_reads = true;
532 // options.allow_mmap_writes = true;
533 // options.max_background_compactions = 40;
534 // BlockBasedTableOptions table_options;
535 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
536 // options.bloom_locality = 1;
537 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
538 // table_options.block_cache = cache;
539 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
540 Status s;
541 // DestroyDB(dbname, Options());
542 s = DB::Open(options, dbname, &db);
543 if (!s.ok()) {
544 std::cerr << s.ToString() << std::endl;
545 assert(false);
546 }
547 total = 1000;
548 return std::shared_ptr<DB>(db);
549}
550
551CollocatorIterator *
552CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) const {
553 ReadOptions options;
554 options.prefix_same_as_start = true;
555 char prefixc[sizeof(uint64_t)];
556 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
557 Iterator *it = db_->NewIterator(options);
558 auto *cit = new CollocatorIterator(it);
559 if (w2 > 0)
560 cit->Seek(std::string(prefixc, 6));
561 else
562 cit->Seek(std::string(prefixc, 3));
563 cit->setPrefix(prefixc);
564 return cit;
565}
566
567void CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) const {
568 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
569 for (; it->isValid(); it->Next()) {
570 uint64_t value = it->intValue();
571 uint64_t key = it->intKey();
572 std::cout << "w1:" << W1(key) << ", w2:" << W2(key)
573 << ", dist:" << (int32_t)DIST(key) << " - count:" << value
574 << std::endl;
575 }
576 std::cout << "ready dumping\n";
577}
578
579bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) {
580 return lhs.npmi > rhs.npmi;
581}
582
583bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) {
584 return lhs.lfmd > rhs.lfmd;
585}
586
587bool sortByLlr(const Collocator &lhs, const Collocator &rhs) {
588 return lhs.llr > rhs.llr;
589}
590
591bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) {
592 return lhs.logdice > rhs.logdice;
593}
594
595bool sortByLogDiceAF(const Collocator &lhs, const Collocator &rhs) {
596 return lhs.ldaf > rhs.ldaf;
597}
598
599void CollocatorDB::applyCAMeasures(
600 const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
601 const uint64_t sum, const int usedPositions, int true_window_size,
602 Collocator *result) const {
603 uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
604 double o = sum, r1 = f1 * true_window_size, c1 = f2, e = r1 * c1 / total,
605 pmi = log2(o / e), md = log2(o * o / e), lfmd = log2(o * o * o / e),
606 llr = ca_ll(f1, f2, sum, total, true_window_size);
607 double ld = ca_logdice(f1, f2, sum, total, true_window_size);
608
609 int bestWindow = usedPositions;
610 double bestAF = ld;
611 // if(f1<75000000)
612 // #pragma omp parallel for reduction(max:bestAF)
613 // #pragma omp target teams distribute parallel for reduction(max:bestAF)
614 // map(tofrom:bestAF,currentAF,bestWindow,usedPositions)
615 for (int bitmask = 1; bitmask < (1 << (2 * WINDOW_SIZE)); bitmask++) {
616 if ((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0)
617 continue;
618 uint64_t currentWindowSum = 0;
619 // #pragma omp target teams distribute parallel for
620 // reduction(+:currentWindowSum) map(tofrom:bitmask,usedPositions)
621 for (int pos = 0; pos < 2 * WINDOW_SIZE; pos++) {
622 if (((1 << pos) & bitmask & usedPositions) != 0)
623 currentWindowSum += sumWindow[pos];
624 }
625 double currentAF = ca_logdice(f1, f2, currentWindowSum, total,
626 __builtin_popcount(bitmask));
627 if (currentAF > bestAF) {
628 bestAF = currentAF;
629 bestWindow = bitmask;
630 }
631 }
632
633 *result = {w2,
634 f2,
635 sum,
636 pmi,
637 pmi / (-log2(o / total / true_window_size)),
638 llr,
639 lfmd,
640 md,
641 sumWindow[WINDOW_SIZE],
642 sumWindow[WINDOW_SIZE - 1],
643 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1),
644 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE - 1], total, 1),
645 ca_dice(f1, f2, sum, total, true_window_size),
646 ld,
647 bestAF,
648 usedPositions,
649 bestWindow};
650}
651
652std::vector<Collocator>
653CollocatorDB::get_collocators(uint32_t w1, uint32_t min_w2,
654 uint32_t max_w2) {
655 std::vector<Collocator> collocators;
656 uint64_t w2, last_w2 = 0xffffffffffffffff;
657 uint64_t maxv = 0, sum = 0;
658 auto *sumWindow =
659 static_cast<uint64_t *>(malloc(sizeof(uint64_t) * 2 * WINDOW_SIZE));
660 memset(sumWindow, 0, sizeof(uint64_t) * 2 * WINDOW_SIZE);
661 int true_window_size = 1;
662 int usedPositions = 0;
663
664 if (w1 > _vocab.size()) {
665 std::cout << w1 << "> vocabulary size " << _vocab.size() << "\n";
666 w1 -= _vocab.size();
667 }
668#ifdef DEBUG
669 std::cout << "Searching for collocates of " << _vocab[w1].word << "\n";
670#endif
671 // #pragma omp parallel num_threads(40)
672 // #pragma omp single
673 for (auto it =
674 std::unique_ptr<CollocatorIterator>(SeekIterator(w1, min_w2, 0));
675 it->isValid(); it->Next()) {
676 uint64_t value = it->intValue(), key = it->intKey();
677 if ((w2 = W2(key)) > max_w2)
678 continue;
679 if (last_w2 == 0xffffffffffffffff)
680 last_w2 = w2;
681 if (w2 != last_w2) {
682 if (sum >= FREQUENCY_THRESHOLD) {
683 collocators.push_back({});
684 Collocator *result = &(collocators[collocators.size() - 1]);
685 // #pragma omp task firstprivate(last_w2, sumWindow, sum, usedPositions,
686 // true_window_size) shared(w1, result) if(sum > 1000000)
687 {
688 // uint64_t *nsw = (uint64_t *)malloc(sizeof(uint64_t) * 2
689 // *WINDOW_SIZE); memcpy(nsw, sumWindow, sizeof(uint64_t) * 2
690 // *WINDOW_SIZE);
691 applyCAMeasures(w1, last_w2, sumWindow, sum, usedPositions,
692 true_window_size, result);
693 // free(nsw);
694 }
695 }
696 memset(sumWindow, 0, 2 * WINDOW_SIZE * sizeof(uint64_t));
697 usedPositions = 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
698 sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
699 last_w2 = w2;
700 maxv = value;
701 sum = value;
702 true_window_size = 1;
703 if (min_w2 == max_w2 && w2 != min_w2)
704 break;
705 } else {
706 sum += value;
707 if (value > maxv)
708 maxv = value;
709 usedPositions |=
710 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
711 sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
712 true_window_size++;
713 }
714 }
715
716 // #pragma omp taskwait
717 sort(collocators.begin(), collocators.end(), sortByLogDiceAF);
718
719#ifdef DEBUG
720 int i = 0;
721 for (Collocator c : collocators) {
722 if (i++ > 10)
723 break;
724 std::cout << "w1:" << _vocab[w1].word << ", w2: *" << _vocab[c.w2].word
725 << "*"
726 << "\t f(w1):" << _vocab[w1].freq
727 << "\t f(w2):" << _vocab[c.w2].freq << "\t f(w1, w2):" << c.raw
728 << "\t pmi:" << c.pmi << "\t npmi:" << c.npmi
729 << "\t llr:" << c.llr << "\t md:" << c.md << "\t lfmd:" << c.lfmd
730 << "\t total:" << total << std::endl;
731 }
732#endif
733
734 return collocators;
735}
736
737std::vector<Collocator>
738CollocatorDB::get_collocation_scores(uint32_t w1, uint32_t w2) {
739 return get_collocators(w1, w2, w2);
740}
741
742std::vector<Collocator> CollocatorDB::get_collocators(uint32_t w1) {
743 return get_collocators(w1, 0, UINT32_MAX);
744}
745
746void CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
747 std::vector<Collocator> collocators;
748 std::stringstream stream;
749 uint64_t w2, last_w2 = 0xffffffffffffffff;
750 uint64_t maxv = 0, total_w1 = 0;
751 bool first = true;
752 for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0));
753 it->isValid(); it->Next()) {
754 uint64_t value = it->intValue(), key = it->intKey();
755 w2 = W2(key);
756 total_w1 += value;
757 if (last_w2 == 0xffffffffffffffff)
758 last_w2 = w2;
759 if (w2 != last_w2) {
760 if (maxv >= min_cooccur) {
761 double llr =
762 ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
763 if (first)
764 first = false;
765 else
766 stream << " ";
767 stream << w2 << " " << llr;
768 }
769 last_w2 = w2;
770 maxv = value;
771 } else {
772 if (value > maxv)
773 maxv = value;
774 }
775 }
776 if (first)
777 stream << "1 0.0";
778 stream << "\n";
779 std::cout << stream.str();
780}
781
782Slice CollocatorIterator::key() const {
783 return base_iterator_->key();
784}
785
786Slice CollocatorIterator::value() const {
787 return base_iterator_->value();
788}
789
790Status CollocatorIterator::status() const {
791 return base_iterator_->status();
792}
793
794}; // namespace rocksdb
795
796string CollocatorDB::getWord(uint32_t w1) { return _vocab[w1].word; }
797
798uint64_t CollocatorDB::getWordId(const char *word) const {
Marc Kupietz979580e2024-11-21 18:05:07 +0100799 for (uint64_t i = 0; i < _vocab.size(); i++) {
800 if (strcmp(_vocab[i].word.c_str(), word) == 0)
801 return i;
802 }
803 return 0;
804}
805
Marc Kupietz39887082024-11-22 18:06:20 +0100806string CollocatorDB::collocators2json(uint32_t w1,
807 const vector<Collocator>& collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100808 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100809 int i = 0;
Marc Kupietz39887082024-11-22 18:06:20 +0100810 s << " { \"f1\": " << _vocab[w1].freq << "," << R"("w1":")"
811 << string(_vocab[w1].word) << "\", " << "\"N\": " << total << ", "
812 << "\"collocates\": [";
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100813 bool first = true;
814 for (Collocator c : collocators) {
Marc Kupietz39887082024-11-22 18:06:20 +0100815 if (strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0)
816 continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100817 if (i++ > 200)
818 break;
Marc Kupietz12af0192021-03-13 18:05:14 +0100819 if (!first)
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100820 s << ",\n";
821 else
822 first = false;
823 s << "{"
Marc Kupietz39887082024-11-22 18:06:20 +0100824 "\"word\":\""
825 << (string(_vocab[c.w2].word) == "<num>"
826 ? string("###")
827 : string(_vocab[c.w2].word))
828 << "\"," << "\"f2\":" << c.f2 << "," << "\"f\":" << c.raw << ","
829 << "\"npmi\":" << c.npmi << "," << "\"pmi\":" << c.pmi << ","
830 << "\"llr\":" << c.llr << "," << "\"lfmd\":" << c.lfmd << ","
831 << "\"md\":" << c.md << "," << "\"dice\":" << c.dice << ","
832 << "\"ld\":" << c.logdice << "," << "\"ln_count\":" << c.left_raw << ","
833 << "\"rn_count\":" << c.right_raw << "," << "\"ln_pmi\":" << c.left_pmi
834 << "," << "\"rn_pmi\":" << c.right_pmi << "," << "\"ldaf\":" << c.ldaf
835 << "," << "\"win\":" << c.window << "," << "\"afwin\":" << c.af_window
836 << "}";
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100837 }
Marc Kupietze9627152019-02-04 12:32:12 +0100838 s << "]}\n";
Marc Kupietz0421d092021-03-13 18:05:14 +0100839 // std::cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100840 return s.str();
841}
842
Marc Kupietz39887082024-11-22 18:06:20 +0100843typedef CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100844
845extern "C" {
Marc Kupietz12af0192021-03-13 18:05:14 +0100846#ifdef __clang__
847#pragma clang diagnostic push
848#pragma ide diagnostic ignored "OCUnusedGlobalDeclarationInspection"
849#endif
Marc Kupietz39887082024-11-22 18:06:20 +0100850DLL_EXPORT COLLOCATORS *open_collocatordb_for_write(char *dbname) {
851 return new CollocatorDB(dbname, false);
852}
Marc Kupietz12af0192021-03-13 18:05:14 +0100853
Marc Kupietz39887082024-11-22 18:06:20 +0100854DLL_EXPORT COLLOCATORS *open_collocatordb(char *dbname) {
855 return new CollocatorDB(dbname, true);
856}
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100857
Marc Kupietz39887082024-11-22 18:06:20 +0100858DLL_EXPORT void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2,
859 int8_t dist) {
860 db->inc(w1, w2, dist);
861}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100862
Marc Kupietz39887082024-11-22 18:06:20 +0100863DLL_EXPORT void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2,
864 int8_t dist) {
865 db->dump(w1, w2, dist);
866}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100867
Marc Kupietz39887082024-11-22 18:06:20 +0100868DLL_EXPORT COLLOCATORS *get_collocators(COLLOCATORS *db, uint32_t w1) {
869 std::vector<Collocator> c = db->get_collocators(w1);
870 if (c.empty())
871 return nullptr;
872 uint64_t size = c.size() + sizeof c[0];
873 auto *p = (COLLOCATORS *)malloc(size);
874 memcpy(p, c.data(), size);
875 return p;
876}
Marc Kupietz88d116b2021-03-13 18:05:14 +0100877
Marc Kupietz39887082024-11-22 18:06:20 +0100878DLL_EXPORT COLLOCATORS *get_collocation_scores(COLLOCATORS *db, uint32_t w1,
879 uint32_t w2) {
880 std::vector<Collocator> c = db->get_collocation_scores(w1, w2);
881 if (c.empty())
882 return nullptr;
883 uint64_t size = c.size() + sizeof c[0];
884 auto *p = (COLLOCATORS *)malloc(size);
885 memcpy(p, c.data(), size);
886 return p;
887}
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200888
Marc Kupietz39887082024-11-22 18:06:20 +0100889DLL_EXPORT char *get_word(COLLOCATORS *db, uint32_t w) {
890 return strdup(db->getWord(w).c_str());
891}
Marc Kupietz979580e2024-11-21 18:05:07 +0100892
Marc Kupietz39887082024-11-22 18:06:20 +0100893DLL_EXPORT uint64_t get_word_id(COLLOCATORS *db, char *word) {
894 return db->getWordId(word);
895}
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100896
Marc Kupietz39887082024-11-22 18:06:20 +0100897DLL_EXPORT void read_vocab(COLLOCATORS *db, char *fname) {
898 std::string fName(fname);
899 db->readVocab(fName);
900}
Marc Kupietz88d116b2021-03-13 18:05:14 +0100901
Marc Kupietz39887082024-11-22 18:06:20 +0100902DLL_EXPORT const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
903 return strdup(db->collocators2json(w1, db->get_collocators(w1)).c_str());
904}
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100905
Marc Kupietz39887082024-11-22 18:06:20 +0100906DLL_EXPORT const char *
907get_collocation_scores_as_json(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
908 return strdup(
909 db->collocators2json(w1, db->get_collocation_scores(w1, w2)).c_str());
910}
911
912DLL_EXPORT const char *get_version() { return PROJECT_VERSION; }
Marc Kupietz6208fd72024-11-15 15:46:19 +0100913
// Restores the diagnostic state saved by the push at the start of the
// extern "C" block; a second push here would leave the suppression active for
// everything that follows.
#ifdef __clang__
#pragma clang diagnostic pop
#endif
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100917}