#ifdef __cplusplus
#include <string>
#include <typeinfo>
#include <vector>

#include "rocksdb/db.h"
#endif
#include <stdint.h>
Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 6 | |
/* Non-zero on a big-endian host: the byte pair 0x00 0xff, read as one 16-bit
 * integer, is 0x00ff (< 0x100) only when the first byte is the most
 * significant one.
 * NOTE(review): this reads a string literal through a uint16_t pointer, which
 * relies on the platform tolerating the unaligned/aliased access. */
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)

/* Packs a collocation into one 64-bit key: dist is shifted to bit 56, w2 to
 * bit 24 and w1 is OR-ed in unshifted. No field is masked here, so the caller
 * must keep w1 and w2 within the 24 bits that W1()/W2() read back.
 * Every parameter is parenthesized so that expression arguments such as
 * `a | b` are packed as a whole. */
#define encodeCollocation(w1, w2, dist) \
  (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (uint64_t)(w1))

/* Bits 0..23 of key. */
#define W1(key) ((uint64_t)((key) & 0xffffff))

/* Bits 24..47 of key. */
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))

/* Bits 56..63 of key, reinterpreted as a signed 8-bit value. */
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
| 12 | |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 13 | #ifdef __cplusplus |
Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 14 | namespace rocksdb { |
// Plain record holding the values reported for one collocate.
//
// All members are public and the class declares no constructors or default
// member initializers, so it can be brace-initialized member by member in
// declaration order; a default-initialized instance has indeterminate values.
// Do not reorder the members: positional initialization depends on the order.
//
// NOTE(review): the per-member remarks below are inferred from the member
// names only -- confirm them against the code that fills this record.
class Collocator {
 public:
  uint64_t w2;   // presumably the id of the collocate word
  uint64_t f2;   // presumably a frequency count belonging to w2
  uint64_t raw;  // presumably the raw co-occurrence count

  // Association scores (names suggest PMI, normalized PMI, log-likelihood
  // ratio, log-frequency-biased mutual dependency and mutual dependency).
  double pmi;
  double npmi;
  double llr;
  double lfmd;
  double md;

  // Variants of lfmd / npmi restricted to one side -- TODO confirm.
  double left_lfmd;
  double right_lfmd;
  double left_npmi;
  double right_npmi;

  // Dice-family scores; ldaf is not explained anywhere in this header.
  double dice;
  double logdice;
  double ldaf;

  int window;     // presumably the window the main scores refer to
  int af_window;  // presumably the window the ldaf score refers to
};
| 35 | |
Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 36 | class CollocatorIterator : public Iterator { |
| 37 | public: |
| 38 | CollocatorIterator(const Iterator& it); |
| 39 | void SeekToFirst(); |
| 40 | void SeekToLast(); |
| 41 | void Seek(const rocksdb::Slice&); |
| 42 | void Prev(); |
| 43 | bool isValid(); |
| 44 | uint64_t intValue(); |
| 45 | uint64_t intKey(); |
| 46 | }; |
| 47 | |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 48 | extern "C" { |
Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 49 | class CollocatorDB { |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 50 | public: |
Marc Kupietz | 3400aa5 | 2018-06-05 10:28:55 +0200 | [diff] [blame] | 51 | std::string getWord(uint32_t w1); |
| 52 | std::vector<Collocator> get_collocators(uint32_t w1); |
Marc Kupietz | bd96619 | 2018-10-13 14:14:37 +0200 | [diff] [blame] | 53 | std::vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2); |
Marc Kupietz | 3400aa5 | 2018-06-05 10:28:55 +0200 | [diff] [blame] | 54 | void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur); |
| 55 | CollocatorDB(const char *db_name, const bool read_only); |
Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 56 | void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist); |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 57 | void dump(const uint32_t w1, const uint32_t w2, const uint8_t dist); |
Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 58 | CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist); |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 59 | }; |
| 60 | |
| 61 | } |
Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 62 | } |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 63 | |
Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 64 | typedef rocksdb::CollocatorDB COLLOCATORDB; |
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 65 | |
| 66 | #else |
/* C view of the handle: an incomplete struct type, usable only via pointers. */
typedef struct COLLOCATORDB COLLOCATORDB;
Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 68 | #endif |
| 69 | |
Marc Kupietz | 88d116b | 2021-03-13 18:05:14 +0100 | [diff] [blame^] | 70 | extern COLLOCATORDB *open_collocatordb(const char *s); |
| 71 | extern COLLOCATORDB *open_collocatordb_for_write(const char *s); |
Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 72 | extern void inc_collocator(COLLOCATORDB *db, uint64_t w1, uint64_t w2, int8_t dist); |
| 73 | extern void dump_collocators(COLLOCATORDB *db, uint32_t w1, uint32_t w2, int8_t dist); |
| 74 | extern void get_collocators(COLLOCATORDB *db, uint32_t w1); |
Marc Kupietz | 88d116b | 2021-03-13 18:05:14 +0100 | [diff] [blame^] | 75 | extern void get_collocation_scores(COLLOCATORDB *db, uint32_t w1, uint32_t w2); |
Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 76 | extern char *get_collocators_as_json(COLLOCATORDB *db, uint32_t w1); |
Marc Kupietz | 88d116b | 2021-03-13 18:05:14 +0100 | [diff] [blame^] | 77 | extern char *get_collocation_scores_as_json(COLLOCATORDB *db, uint32_t w1, uint32_t w2); |
Marc Kupietz | ca3a52e | 2018-06-05 14:16:23 +0200 | [diff] [blame] | 78 | extern char *get_word(COLLOCATORDB *db, uint32_t w1); |