#ifdef __cplusplus
#include <memory>
#include <string>
#include <typeinfo>
#include <vector>

#include "rocksdb/db.h"
#endif
#include <stdint.h>
Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 6 | |
/*
 * Evaluates to non-zero when the two bytes {0x00, 0xff} read back as a
 * uint16_t smaller than 0x100, i.e. when the first byte is the most
 * significant one.
 */
#define IS_BIG_ENDIAN (*(const uint16_t *)"\0\xff" < 0x100)

/*
 * Packs a collocation into one 64-bit key:
 *   bits  0..23  w1
 *   bits 24..    w2 (shifted left by 24)
 *   bits 56..63  dist
 * Every argument is parenthesized and widened to uint64_t before it is
 * shifted, so arbitrary expressions (e.g. `a | b`, `c ? x : y`) can be passed
 * safely. No masking is applied: values wider than their slot spill into the
 * neighbouring field, exactly as before.
 */
#define encodeCollocation(w1, w2, dist) \
  (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (uint64_t)(w1))

/* Extracts the low 24 bits (w1) of a packed key. */
#define W1(key) ((uint64_t)((key) & 0xffffff))

/* Extracts bits 24..47 (w2) of a packed key. */
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))

/* Extracts the top byte of a packed key as a signed 8-bit distance. */
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
| 12 | |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 13 | #ifdef __cplusplus |
Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 14 | namespace rocksdb { |
Marc Kupietz | 75d35d2 | 2024-11-21 14:31:44 +0100 | [diff] [blame] | 15 | |
// Plain record describing one collocate (w2) together with its counts and
// association scores. All members are public; the member order is part of the
// layout and must not be rearranged.
//
// NOTE(review): the C mirror of this record (typedef COLLOCATOR in the plain-C
// branch of this header) declares an extra `double md_nws` between `md` and
// `left_raw`. If C callers read objects of this class through COLLOCATOR*, the
// two layouts disagree from that point on — confirm and align both sides.
class Collocator {
 public:
  uint32_t w2;         // id of the collocate word
  uint64_t f2;         // presumably the corpus frequency of w2 — TODO confirm
  uint64_t raw;        // presumably the raw co-occurrence count — TODO confirm

  // Association scores; names suggest PMI, normalized PMI, log-likelihood
  // ratio and (log-frequency biased) mutual dependency — TODO confirm.
  double pmi;
  double npmi;
  double llr;
  double lfmd;
  double md;

  // Counts and PMI split by side (left/right of the node word, presumably).
  uint64_t left_raw;
  uint64_t right_raw;
  double left_pmi;
  double right_pmi;

  // Dice-family scores.
  double dice;
  double logdice;
  double ldaf;

  // Window descriptors; exact encoding is not visible in this header.
  int window;
  int af_window;
};
Marc Kupietz | 3400aa5 | 2018-06-05 10:28:55 +0200 | [diff] [blame] | 36 | |
Marc Kupietz | 75d35d2 | 2024-11-21 14:31:44 +0100 | [diff] [blame] | 37 | class CollocatorIterator : public rocksdb::Iterator { |
| 38 | public: |
| 39 | CollocatorIterator(rocksdb::Iterator* base_iterator); |
| 40 | void setPrefix(char* prefix); |
| 41 | void SeekToFirst() override; |
| 42 | void SeekToLast() override; |
| 43 | void Seek(const rocksdb::Slice& s) override; |
| 44 | void SeekForPrev(const rocksdb::Slice& s) override; |
| 45 | void Prev() override; |
| 46 | void Next() override; |
| 47 | rocksdb::Slice key() const override; |
| 48 | rocksdb::Slice value() const override; |
| 49 | rocksdb::Status status() const override; |
| 50 | bool Valid() const override; |
| 51 | bool isValid(); |
| 52 | uint64_t intValue(); |
| 53 | uint64_t intKey(); |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 54 | |
Marc Kupietz | 75d35d2 | 2024-11-21 14:31:44 +0100 | [diff] [blame] | 55 | private: |
| 56 | char prefixc[sizeof(uint64_t)]; |
| 57 | rocksdb::Iterator* base_iterator_; |
| 58 | }; |
| 59 | |
| 60 | class CollocatorDB { |
| 61 | public: |
| 62 | CollocatorDB(const char* db_name, bool read_only = false); |
| 63 | void readVocab(std::string fname); |
| 64 | std::string getWord(uint32_t w1); |
| 65 | void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist); |
| 66 | void dump(uint32_t w1, uint32_t w2, int8_t dist); |
| 67 | std::vector<Collocator> get_collocators(uint32_t w1); |
| 68 | std::vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2); |
| 69 | std::vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2); |
| 70 | std::vector<Collocator> get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2); |
| 71 | void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur); |
| 72 | std::string collocators2json(uint32_t w1, std::vector<Collocator> collocators); |
| 73 | CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist); |
| 74 | |
| 75 | private: |
| 76 | std::shared_ptr<rocksdb::DB> OpenDb(const char* dbname); |
| 77 | std::shared_ptr<rocksdb::DB> OpenDbForRead(const char* dbname); |
| 78 | void applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t* sumWindow, const uint64_t sum, const int usedPositions, int true_window_size, Collocator* result); |
| 79 | |
| 80 | rocksdb::WriteOptions merge_option_; |
| 81 | char _one[sizeof(uint64_t)]; |
| 82 | rocksdb::Slice _one_slice; |
| 83 | std::vector<Collocator> _vocab; |
| 84 | uint64_t total; |
| 85 | uint64_t sentences; |
| 86 | float avg_window_size; |
| 87 | std::shared_ptr<rocksdb::DB> db_; |
| 88 | rocksdb::WriteOptions put_option_; |
| 89 | rocksdb::ReadOptions get_option_; |
| 90 | rocksdb::WriteOptions delete_option_; |
| 91 | uint64_t default_; |
| 92 | }; |
| 93 | |
| 94 | } // namespace rocksdb |
| 95 | |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 96 | |
| 97 | #else |
/* Opaque database handle for C callers; only ever used through pointers. */
typedef struct COLLOCATORDB COLLOCATORDB;

/*
 * One collocate (w2) with its counts and association scores, as seen from C.
 *
 * NOTE(review): the C++ class Collocator declared in the __cplusplus branch
 * of this header has no `md_nws` member, so the two layouts differ from
 * `md_nws` onwards. Confirm which side is authoritative and align them.
 */
typedef struct {
  uint32_t w2;        /* id of the collocate word */
  uint64_t f2;        /* presumably the corpus frequency of w2 - TODO confirm */
  uint64_t raw;       /* presumably the raw co-occurrence count - TODO confirm */

  /* association scores */
  double pmi;
  double npmi;
  double llr;
  double lfmd;
  double md;
  double md_nws;

  /* counts and PMI split by side */
  uint64_t left_raw;
  uint64_t right_raw;
  double left_pmi;
  double right_pmi;

  /* Dice-family scores */
  double dice;
  double logdice;
  double ldaf;

  /* window descriptors; exact encoding is not visible in this header */
  int window;
  int af_window;
} COLLOCATOR;
| 120 | |
Marc Kupietz | 88d116b | 2021-03-13 18:05:14 +0100 | [diff] [blame] | 121 | extern COLLOCATORDB *open_collocatordb(const char *s); |
| 122 | extern COLLOCATORDB *open_collocatordb_for_write(const char *s); |
Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 123 | extern void inc_collocator(COLLOCATORDB *db, uint64_t w1, uint64_t w2, int8_t dist); |
| 124 | extern void dump_collocators(COLLOCATORDB *db, uint32_t w1, uint32_t w2, int8_t dist); |
Marc Kupietz | 6663f11 | 2021-03-14 09:20:59 +0100 | [diff] [blame] | 125 | extern COLLOCATOR *get_collocators(COLLOCATORDB *db, uint32_t w1); |
| 126 | extern COLLOCATOR *get_collocation_scores(COLLOCATORDB *db, uint32_t w1, uint32_t w2); |
Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 127 | extern char *get_collocators_as_json(COLLOCATORDB *db, uint32_t w1); |
Marc Kupietz | 88d116b | 2021-03-13 18:05:14 +0100 | [diff] [blame] | 128 | extern char *get_collocation_scores_as_json(COLLOCATORDB *db, uint32_t w1, uint32_t w2); |
Marc Kupietz | ca3a52e | 2018-06-05 14:16:23 +0200 | [diff] [blame] | 129 | extern char *get_word(COLLOCATORDB *db, uint32_t w1); |
Marc Kupietz | b4a683c | 2021-03-14 09:19:44 +0100 | [diff] [blame] | 130 | extern void read_vocab(COLLOCATORDB *db, char *fname); |
Marc Kupietz | 6208fd7 | 2024-11-15 15:46:19 +0100 | [diff] [blame] | 131 | extern char *get_version(); |
Marc Kupietz | 979580e | 2024-11-21 18:05:07 +0100 | [diff] [blame] | 132 | extern uint64_t get_word_id(COLLOCATORDB *db, const char *word); |
Marc Kupietz | d26b105 | 2024-12-10 16:56:39 +0100 | [diff] [blame] | 133 | extern uint64_t get_corpus_size(COLLOCATORDB *db); |
Marc Kupietz | 21b964c | 2024-12-10 17:10:50 +0100 | [diff] [blame] | 134 | extern uint64_t get_word_frequency(COLLOCATORDB *db, uint64_t w1); |
Marc Kupietz | 979580e | 2024-11-21 18:05:07 +0100 | [diff] [blame] | 135 | |
Marc Kupietz | 75d35d2 | 2024-11-21 14:31:44 +0100 | [diff] [blame] | 136 | #endif |