blob: 74f2f0ec2d4de8ab6818c7498e27bcd846d3d7af [file] [log] [blame]
Marc Kupietz06c9a9f2018-01-02 16:56:43 +01001#ifdef __cplusplus
Marc Kupietz4b799e92018-01-02 11:04:56 +01002#include <typeinfo>
Marc Kupietz4b799e92018-01-02 11:04:56 +01003#include "rocksdb/db.h"
Marc Kupietz06c9a9f2018-01-02 16:56:43 +01004#endif
5#include <stdint.h>
Marc Kupietz4b799e92018-01-02 11:04:56 +01006
/* Non-zero on a big-endian host: the byte pair {0x00, 0xff} read as a
 * uint16_t yields 0x00ff (< 0x100) when the first byte is the most
 * significant one, and 0xff00 otherwise. */
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)

/* Packs a collocation into one 64-bit key:
 *   bits 56..63  dist
 *   bits 24..55  w2
 *   bits  0..23  w1
 * Every argument is parenthesized so that expression arguments
 * (e.g. `a | b`, `c ? x : y`) are evaluated as a unit before the cast/shift.
 * NOTE(review): w1 is OR-ed in unmasked and W2() reads back only 24 bits, so
 * word ids are presumably expected to fit in 24 bits -- confirm with callers. */
#define encodeCollocation(w1, w2, dist) \
    (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (w1))

/* Extracts the low 24 bits (w1) of a key built by encodeCollocation(). */
#define W1(key) ((uint64_t)((key) & 0xffffff))

/* Extracts the 24 bits starting at bit 24 (w2) of a key. */
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))

/* Extracts the top byte of a key and reinterprets it as a signed distance. */
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
12
Marc Kupietz06c9a9f2018-01-02 16:56:43 +010013#ifdef __cplusplus
Marc Kupietz4b799e92018-01-02 11:04:56 +010014namespace rocksdb {
Marc Kupietz75d35d22024-11-21 14:31:44 +010015
/**
 * Plain record describing one collocate (w2) together with its counts and
 * association scores.
 *
 * Field order and types mirror the C `COLLOCATOR` struct declared in the
 * `#else` branch of this header, including `md_nws`, which was previously
 * missing here and made the two records disagree in size and in the offset of
 * every field after `md`.
 * NOTE(review): the C API returns `COLLOCATOR *`; presumably those buffers are
 * filled from instances of this class, so the layouts must stay in lockstep --
 * confirm against the implementation file.
 *
 * Field meanings are inferred from their names only (pmi, npmi, llr, dice,
 * logdice, ...); the exact formulas are not visible in this header.
 */
class Collocator {
public:
    uint32_t w2;         // id of the collocate word
    uint64_t f2;
    uint64_t raw;
    double pmi;
    double npmi;
    double llr;
    double lfmd;
    double md;
    double md_nws;       // kept in the same position as COLLOCATOR::md_nws
    uint64_t left_raw;
    uint64_t right_raw;
    double left_pmi;
    double right_pmi;
    double dice;
    double logdice;
    double ldaf;
    int window;
    int af_window;
};
Marc Kupietz3400aa52018-06-05 10:28:55 +020036
Marc Kupietz75d35d22024-11-21 14:31:44 +010037class CollocatorIterator : public rocksdb::Iterator {
38public:
39 CollocatorIterator(rocksdb::Iterator* base_iterator);
40 void setPrefix(char* prefix);
41 void SeekToFirst() override;
42 void SeekToLast() override;
43 void Seek(const rocksdb::Slice& s) override;
44 void SeekForPrev(const rocksdb::Slice& s) override;
45 void Prev() override;
46 void Next() override;
47 rocksdb::Slice key() const override;
48 rocksdb::Slice value() const override;
49 rocksdb::Status status() const override;
50 bool Valid() const override;
51 bool isValid();
52 uint64_t intValue();
53 uint64_t intKey();
Marc Kupietz06c9a9f2018-01-02 16:56:43 +010054
Marc Kupietz75d35d22024-11-21 14:31:44 +010055private:
56 char prefixc[sizeof(uint64_t)];
57 rocksdb::Iterator* base_iterator_;
58};
59
60class CollocatorDB {
61public:
62 CollocatorDB(const char* db_name, bool read_only = false);
63 void readVocab(std::string fname);
64 std::string getWord(uint32_t w1);
65 void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
66 void dump(uint32_t w1, uint32_t w2, int8_t dist);
67 std::vector<Collocator> get_collocators(uint32_t w1);
68 std::vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
69 std::vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
70 std::vector<Collocator> get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2);
71 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
72 std::string collocators2json(uint32_t w1, std::vector<Collocator> collocators);
73 CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
74
75private:
76 std::shared_ptr<rocksdb::DB> OpenDb(const char* dbname);
77 std::shared_ptr<rocksdb::DB> OpenDbForRead(const char* dbname);
78 void applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t* sumWindow, const uint64_t sum, const int usedPositions, int true_window_size, Collocator* result);
79
80 rocksdb::WriteOptions merge_option_;
81 char _one[sizeof(uint64_t)];
82 rocksdb::Slice _one_slice;
83 std::vector<Collocator> _vocab;
84 uint64_t total;
85 uint64_t sentences;
86 float avg_window_size;
87 std::shared_ptr<rocksdb::DB> db_;
88 rocksdb::WriteOptions put_option_;
89 rocksdb::ReadOptions get_option_;
90 rocksdb::WriteOptions delete_option_;
91 uint64_t default_;
92};
93
94} // namespace rocksdb
95
Marc Kupietz06c9a9f2018-01-02 16:56:43 +010096
97#else
/* Opaque database handle for C callers; the struct is never defined here. */
typedef struct COLLOCATORDB COLLOCATORDB;

/*
 * C view of one collocator record.
 * NOTE(review): presumably has to stay layout-compatible with the C++
 * rocksdb::Collocator class declared in the C++ branch of this header --
 * confirm before adding, removing or reordering fields.
 * Field meanings are inferred from their names; formulas are not shown here.
 */
typedef struct {
    /* collocate id and counts */
    uint32_t w2;
    uint64_t f2;
    uint64_t raw;

    /* association scores */
    double pmi;
    double npmi;
    double llr;
    double lfmd;
    double md;
    double md_nws;

    /* left/right split of the raw count and of pmi */
    uint64_t left_raw;
    uint64_t right_raw;
    double left_pmi;
    double right_pmi;

    /* further scores */
    double dice;
    double logdice;
    double ldaf;

    /* window information */
    int window;
    int af_window;
} COLLOCATOR;
120
Marc Kupietz88d116b2021-03-13 18:05:14 +0100121extern COLLOCATORDB *open_collocatordb(const char *s);
122extern COLLOCATORDB *open_collocatordb_for_write(const char *s);
Marc Kupietz6aec7682018-01-10 09:47:48 +0100123extern void inc_collocator(COLLOCATORDB *db, uint64_t w1, uint64_t w2, int8_t dist);
124extern void dump_collocators(COLLOCATORDB *db, uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietz6663f112021-03-14 09:20:59 +0100125extern COLLOCATOR *get_collocators(COLLOCATORDB *db, uint32_t w1);
126extern COLLOCATOR *get_collocation_scores(COLLOCATORDB *db, uint32_t w1, uint32_t w2);
Marc Kupietz6aec7682018-01-10 09:47:48 +0100127extern char *get_collocators_as_json(COLLOCATORDB *db, uint32_t w1);
Marc Kupietz88d116b2021-03-13 18:05:14 +0100128extern char *get_collocation_scores_as_json(COLLOCATORDB *db, uint32_t w1, uint32_t w2);
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200129extern char *get_word(COLLOCATORDB *db, uint32_t w1);
Marc Kupietzb4a683c2021-03-14 09:19:44 +0100130extern void read_vocab(COLLOCATORDB *db, char *fname);
Marc Kupietz6208fd72024-11-15 15:46:19 +0100131extern char *get_version();
Marc Kupietz979580e2024-11-21 18:05:07 +0100132extern uint64_t get_word_id(COLLOCATORDB *db, const char *word);
Marc Kupietzd26b1052024-12-10 16:56:39 +0100133extern uint64_t get_corpus_size(COLLOCATORDB *db);
Marc Kupietz21b964c2024-12-10 17:10:50 +0100134extern uint64_t get_word_frequency(COLLOCATORDB *db, uint64_t w1);
Marc Kupietz979580e2024-11-21 18:05:07 +0100135
Marc Kupietz75d35d22024-11-21 14:31:44 +0100136#endif