| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 1 | #include <typeinfo> | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 2 | #define EXPORT __attribute__((visibility("visible"))) | 
 | 3 | #define IMPORT | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 4 | #include <assert.h> | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 5 | #include <inttypes.h> | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 6 | #include <memory> | 
 | 7 | #include <iostream> | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 8 | #include <algorithm> | 
 | 9 | #include <vector> | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 10 | #include <stdint.h> | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 11 | #include <string> | 
 | 12 | #include <sstream> // for ostringstream | 
 | 13 | #include <math.h> | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 14 | #include <rocksdb/cache.h> | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 15 | #include "rocksdb/comparator.h" | 
 | 16 | #include "rocksdb/db.h" | 
 | 17 | #include "rocksdb/env.h" | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 18 | #include "rocksdb/table.h" | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 19 | #include <rocksdb/merge_operator.h> | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 20 | #include <rocksdb/slice_transform.h> | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 21 | #include "rocksdb/utilities/db_ttl.h" | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 22 | #include "rocksdb/filter_policy.h" | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 23 | #include "merge_operators.h" | 
 | 24 |  | 
| Marc Kupietz | 8cf7e91 | 2019-01-21 17:05:23 +0100 | [diff] [blame] | 25 | #define WINDOW_SIZE 5.0 | 
| Marc Kupietz | 98cbcdc | 2019-01-21 17:11:27 +0100 | [diff] [blame] | 26 | #define FREQUENCY_THRESHOLD 5 | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 27 | #define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100) | 
 // Collocation key layout (64 bit): bits 0-23 hold w1, bits 24-55 are where w2
 // is shifted to (W2() reads back only 24 of them), bits 56-63 hold the
 // distance byte. Every macro argument is parenthesized so that expression
 // arguments such as `a | b` are applied as a whole.
 #define encodeCollocation(w1, w2, dist) (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (w1))
 // Low 24 bits of a key: the first word id.
 #define W1(key) ((uint64_t)((key) & 0xffffff))
 // Bits 24-47 of a key: the second word id.
 #define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
 // Top byte of a key, reinterpreted as a signed 8-bit distance.
 #define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 32 |  | 
 // Plain vocabulary record: a 64-bit count plus a C-string pointer.
 // NOTE(review): nothing in this part of the file uses it (VocabEntry is used
 // instead); ownership of `word` is not established here.
 struct vocab_entry {
   uint64_t freq;
   char *word;
 };
 | 37 |  | 
 | 38 | // typedef struct Collocator { | 
 | 39 | //   uint64_t w2; | 
 | 40 | //   uint64_t sum; | 
 | 41 | // }; | 
 | 42 |  | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 43 | using namespace rocksdb; | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 44 | using namespace std; | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 45 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 46 | namespace rocksdb { | 
// One collocate of a query word together with its association scores.
// All members are public and left uninitialized by default construction.
// NOTE(review): the score fields presumably hold the results of the ca_*
// helpers of the same name (ca_pmi, ca_npmi, ...) - confirm against
// get_collocators().
class Collocator {
public:
  uint64_t w2;         // id of the collocate word
  uint64_t raw;        // presumably the raw co-occurrence count - TODO confirm
  double pmi;
  double npmi;
  double llr;
  double lfmd;
  double md;
  double left_lfmd;    // left/right variants: presumably scores restricted to
  double right_lfmd;   // one side of the window - TODO confirm
  double left_npmi;
  double right_npmi;
  double dice;
  double logdice;
};
 | 63 |  | 
// Counter incremented by CountMergeOperator::Merge on every call.
size_t num_merge_operator_calls;

// Sets the merge-call counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = 0;
}
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 66 |  | 
// Counter for partial-merge calls; nothing in the visible code increments it.
size_t num_partial_merge_calls;

// Sets the partial-merge counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = 0;
}
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 69 |  | 
 | 70 |  | 
// Serializes `value` into the 8 bytes at `buf`, least-significant byte first.
//
// The byte layout is the same one the previous memcpy/shift implementation
// produced on either endianness. Writing the bytes with shifts makes the
// function independent of the host byte order, so it no longer needs the
// IS_BIG_ENDIAN probe (which reads a string literal through a uint16_t*).
//
// buf   - destination, must have room for 8 bytes
// value - the integer to encode
inline void EncodeFixed64(char* buf, uint64_t value) {
  for (unsigned i = 0; i < sizeof(value); ++i) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}
 | 85 |  | 
// Reads a 32-bit unsigned integer stored least-significant byte first at `ptr`.
//
// Assembling the value from individual bytes gives the same result on any
// host byte order, so the IS_BIG_ENDIAN probe is not needed. Each byte goes
// through unsigned char first so that bytes >= 0x80 are not sign-extended.
//
// ptr - source, must point to at least 4 readable bytes
inline uint32_t DecodeFixed32(const char* ptr) {
  uint32_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
 | 99 |  | 
 // Reads a 64-bit unsigned integer stored least-significant byte first at
 // `ptr` (the inverse of EncodeFixed64).
 //
 // The value is assembled byte by byte, which is correct on any host byte
 // order and avoids the IS_BIG_ENDIAN probe. Each byte goes through unsigned
 // char first so that bytes >= 0x80 are not sign-extended.
 //
 // ptr - source, must point to at least 8 readable bytes
 inline uint64_t DecodeFixed64(const char* ptr) {
   uint64_t result = 0;
   for (unsigned i = 0; i < sizeof(result); ++i) {
     result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
   }
   return result;
 }
 | 112 |  | 
// Pointwise mutual information with window correction:
//   log2( total * f12 / (window_size * f1 * f2) )
// f1, f2 are the two word frequencies, f12 their co-occurrence count and
// total the corpus size. No guard for zero counts: f12 == 0 yields -inf.
static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  const double numerator = total * static_cast<double>(f12);
  const double denominator = window_size * static_cast<double>(f1) * static_cast<double>(f2);
  return log2(numerator / denominator);
}
 | 116 |  | 
// Normalized PMI: the window-corrected PMI divided by
// -log2(f12 / window_size / total). Returns -1.0 when f12 is 0.
// Bouma, Gerlof (2009): <a href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
// Normalized (pointwise) mutual information in collocation extraction</a>. In Proceedings of GSCL.
static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  if (f12 != 0) {
    const double pmi = log2(total * static_cast<double>(f12) /
                            (window_size * static_cast<double>(f1) * static_cast<double>(f2)));
    const double normalizer = -log2(static_cast<double>(f12) / window_size / total);
    return pmi / normalizer;
  }
  return -1.0;
}
 | 125 |  | 
 // Mutual dependency score as implemented here:
 //   log2( f12^2 / (total * window_size^2 * f1 * f2) )
 // Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
 // In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
 // Earlier formulation kept for reference:
 // double md = log2(pow((double)max * window_size / total, 2) /  (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
 static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
   const double squared_joint = static_cast<double>(f12) * f12;
   const double scale = static_cast<double>(total) * window_size * window_size * f1 * f2;
   return log2(squared_joint / scale);
 }
 | 132 |  | 
 // Log-frequency biased mutual dependency: the ca_md term plus
 // log2(f12 / window_size / total). Returns 0 when f12 is 0.
 static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
   if (f12 != 0) {
     const double md_term =
         log2(static_cast<double>(f12) * f12 /
              (static_cast<double>(total) * window_size * window_size * f1 * f2));
     const double freq_term = log2(static_cast<double>(f12) / window_size / total);
     return md_term + freq_term;
   }
   return 0;
 }
 | 139 |  | 
// Log-likelihood ratio over the 2x2 contingency table of observed (o..) and
// expected (e..) counts, with row total r1 = w1 * window_size and column
// total c1 = w2. Cells with a non-positive observed count contribute 0.
// Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
// Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
// NOTE(review): window_size is an integer here while ca_pmi/ca_npmi/ca_md take
// a double; a fractional window passed in is truncated - confirm intent.
static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
  // Marginals.
  const double r1 = (double) w1 * window_size;
  const double r2 = (double) n - r1;
  const double c1 = w2;
  const double c2 = n - c1;

  // Observed cell counts.
  const double o11 = w12;
  const double o12 = r1 - o11;
  const double o21 = c1 - w12;
  const double o22 = r2 - o21;

  // Expected cell counts under independence.
  const double e11 = r1 * c1 / n;
  const double e12 = r1 * c2 / n;
  const double e21 = r2 * c1 / n;
  const double e22 = r2 * c2 / n;

  double sum = 0;
  if (o11 > 0) sum = o11 * log(o11 / e11);
  sum = sum + (o12 > 0 ? o12 * log(o12 / e12) : 0);
  sum = sum + (o21 > 0 ? o21 * log(o21 / e21) : 0);
  sum = sum + (o22 > 0 ? o22 * log(o22 / e22) : 0);
  return 2 * sum;
}
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 154 |  | 
| Marc Kupietz | 4188045 | 2019-01-22 15:29:06 +0100 | [diff] [blame^] | 155 |  | 
 // Dice coefficient: 2 * w12 / (w2 + w1 * window_size).
 // `n` is accepted for signature parity with the other scores and is unused.
 // NOTE(review): window_size is an integer here, unlike ca_pmi - confirm intent.
 static inline double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
   const double row_total = (double) w1 * window_size;
   const double col_total = w2;
   const uint64_t doubled_joint = 2 * w12;
   return doubled_joint / (col_total + row_total);
 }
 | 162 |  | 
 // logDice: 14 + log2 of the Dice coefficient, with 0.5 added to each of the
 // three counts so the logarithm stays defined for zero counts.
 // `n` is unused.
 // Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
 static inline double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
   const double e = 0.5;
   const double r1 = (double) w1 * window_size;
   const double c1 = w2;
   const double smoothed_joint = 2 * (w12 + e);
   const double smoothed_marginals = c1 + e + r1 + e;
   return 14 + log2(smoothed_joint / smoothed_marginals);
 }
 | 171 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 172 |   class CountMergeOperator : public AssociativeMergeOperator { | 
 | 173 |   public: | 
 | 174 |     CountMergeOperator() { | 
 | 175 |       mergeOperator_ = MergeOperators::CreateUInt64AddOperator(); | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 176 |     } | 
 | 177 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 178 |     virtual bool Merge(const Slice& key, | 
 | 179 |                        const Slice* existing_value, | 
 | 180 |                        const Slice& value, | 
 | 181 |                        std::string* new_value, | 
 | 182 |                        Logger* logger) const override { | 
 | 183 |       assert(new_value->empty()); | 
 | 184 |       ++num_merge_operator_calls; | 
 | 185 |       if (existing_value == nullptr) { | 
 | 186 |         new_value->assign(value.data(), value.size()); | 
 | 187 |         return true; | 
 | 188 |       } | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 189 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 190 |       return mergeOperator_->PartialMerge( | 
 | 191 |                                           key, | 
 | 192 |                                           *existing_value, | 
 | 193 |                                           value, | 
 | 194 |                                           new_value, | 
 | 195 |                                           logger); | 
 | 196 |     } | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 197 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 198 |     virtual const char* Name() const override { | 
 | 199 |       return "UInt64AddOperator"; | 
 | 200 |     } | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 201 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 202 |   private: | 
 | 203 |     std::shared_ptr<MergeOperator> mergeOperator_; | 
 | 204 |   }; | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 205 |  | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 206 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 207 |   class CollocatorIterator : public Iterator { | 
 | 208 |   private: | 
 | 209 |     char prefixc[sizeof(uint64_t)]; | 
 | 210 |     Iterator *base_iterator_; | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 211 |  | 
 | 212 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 213 |   public: | 
 | 214 |     CollocatorIterator(Iterator* base_iterator) | 
 | 215 |       : base_iterator_(base_iterator) | 
 | 216 |     {} | 
 | 217 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 218 |     void setPrefix(char *prefix) { | 
 | 219 |       memcpy(prefixc, prefix, sizeof(uint64_t)); | 
 | 220 |     } | 
 | 221 |  | 
 | 222 |     virtual void SeekToFirst() { base_iterator_->SeekToFirst(); } | 
 | 223 |     virtual void SeekToLast() { base_iterator_->SeekToLast(); } | 
 | 224 |     virtual void Seek(const rocksdb::Slice& s) { base_iterator_->Seek(s); } | 
 | 225 |     virtual void Prev() { base_iterator_->Prev(); } | 
 | 226 |     virtual void Next() { base_iterator_->Next(); } | 
 | 227 |     virtual Slice key() const; | 
 | 228 |     virtual Slice value() const; | 
 | 229 |     virtual Status status() const; | 
 | 230 |     virtual bool Valid() const; | 
 | 231 |     bool isValid(); | 
 | 232 |     uint64_t intValue(); | 
 | 233 |     uint64_t intKey(); | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 234 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 235 |   }; | 
| Marc Kupietz | 18375e1 | 2017-12-24 10:11:18 +0100 | [diff] [blame] | 236 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 237 |   //  rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {} | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 238 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 239 |   bool rocksdb::CollocatorIterator::Valid() const { | 
| Marc Kupietz | 18375e1 | 2017-12-24 10:11:18 +0100 | [diff] [blame] | 240 |     return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3)); | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 241 |   } | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 242 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 243 |   bool rocksdb::CollocatorIterator::isValid() { | 
 | 244 |     return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3)); | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 245 |     // return key().starts_with(std::string(prefixc,3)); | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 246 |   } | 
| Marc Kupietz | 18375e1 | 2017-12-24 10:11:18 +0100 | [diff] [blame] | 247 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 248 |   uint64_t rocksdb::CollocatorIterator::intKey() { | 
 | 249 |     return DecodeFixed64(base_iterator_->key().data()); | 
 | 250 |   } | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 251 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 252 |   uint64_t rocksdb::CollocatorIterator::intValue() { | 
 | 253 |     return DecodeFixed64(base_iterator_->value().data()); | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 254 |   } | 
 | 255 |  | 
// One vocabulary line as read by CollocatorDB::read_vocab: the word form and
// the count that followed it in the file.
class VocabEntry {
public:
  std::string word;
  uint64_t freq;
};
 | 261 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 262 |   class CollocatorDB { | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 263 |   private: | 
 | 264 |     WriteOptions merge_option_; // for merge | 
 | 265 |     char _one[sizeof(uint64_t)]; | 
 | 266 |     Slice _one_slice; | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 267 |     vector<VocabEntry> _vocab; | 
| Marc Kupietz | 4ec51c1 | 2019-01-21 11:06:39 +0100 | [diff] [blame] | 268 |     uint64_t total = 0; | 
 | 269 |     uint64_t sentences = 0; | 
| Marc Kupietz | 8cf7e91 | 2019-01-21 17:05:23 +0100 | [diff] [blame] | 270 |     float avg_window_size = 8.0; | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 271 |      | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 272 |   protected: | 
 | 273 |     std::shared_ptr<DB> db_; | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 274 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 275 |     WriteOptions put_option_; | 
 | 276 |     ReadOptions get_option_; | 
 | 277 |     WriteOptions delete_option_; | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 278 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 279 |     uint64_t default_; | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 280 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 281 |     std::shared_ptr<DB> OpenDb(const char *dbname); | 
| Marc Kupietz | 6bb2776 | 2018-01-09 17:53:01 +0100 | [diff] [blame] | 282 |     std::shared_ptr<DB> OpenDbForRead(const char *dbname); | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 283 |     void read_vocab(string fname); | 
 | 284 |      | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 285 |   public: | 
| Marc Kupietz | 4a5e08a | 2018-06-05 11:07:11 +0200 | [diff] [blame] | 286 |     string getWord(uint32_t w1); | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 287 |     CollocatorDB(const char *db_name, bool read_only); | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 288 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 289 |     // public interface of CollocatorDB. | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 290 |     // All four functions return false | 
 | 291 |     // if the underlying level db operation failed. | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 292 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 293 |     // mapped to a levedb Put | 
 | 294 |     bool set(const std::string& key, uint64_t value) { | 
 | 295 |       // just treat the internal rep of int64 as the string | 
 | 296 |       char buf[sizeof(value)]; | 
 | 297 |       EncodeFixed64(buf, value); | 
 | 298 |       Slice slice(buf, sizeof(value)); | 
 | 299 |       auto s = db_->Put(put_option_, key, slice); | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 300 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 301 |       if (s.ok()) { | 
 | 302 |         return true; | 
 | 303 |       } else { | 
 | 304 |         std::cerr << s.ToString() << std::endl; | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 305 |         return false; | 
 | 306 |       } | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 307 |     } | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 308 |  | 
 | 309 |     DB *getDb() { | 
 | 310 |       return db_.get(); | 
 | 311 |     } | 
 | 312 |  | 
 | 313 |     // mapped to a rocksdb Delete | 
 | 314 |     bool remove(const std::string& key) { | 
 | 315 |       auto s = db_->Delete(delete_option_, key); | 
 | 316 |  | 
 | 317 |       if (s.ok()) { | 
 | 318 |         return true; | 
 | 319 |       } else { | 
 | 320 |         std::cerr << s.ToString() << std::endl; | 
 | 321 |         return false; | 
 | 322 |       } | 
 | 323 |     } | 
 | 324 |  | 
 | 325 |     // mapped to a rocksdb Get | 
 | 326 |     bool get(const std::string& key, uint64_t* value) { | 
 | 327 |       std::string str; | 
 | 328 |       auto s = db_->Get(get_option_, key, &str); | 
 | 329 |  | 
 | 330 |       if (s.IsNotFound()) { | 
 | 331 |         // return default value if not found; | 
 | 332 |         *value = default_; | 
 | 333 |         return true; | 
 | 334 |       } else if (s.ok()) { | 
 | 335 |         // deserialization | 
 | 336 |         if (str.size() != sizeof(uint64_t)) { | 
 | 337 |           std::cerr << "value corruption\n"; | 
 | 338 |           return false; | 
 | 339 |         } | 
 | 340 |         *value = DecodeFixed64(&str[0]); | 
 | 341 |         return true; | 
 | 342 |       } else { | 
 | 343 |         std::cerr << s.ToString() << std::endl; | 
 | 344 |         return false; | 
 | 345 |       } | 
 | 346 |     } | 
 | 347 |  | 
 | 348 |  | 
 | 349 |     uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) { | 
 | 350 |       char encoded_key[sizeof(uint64_t)]; | 
 | 351 |       EncodeFixed64(encoded_key, encodeCollocation(w1,w2,dist)); | 
 | 352 |       uint64_t value = default_; | 
 | 353 |       get(std::string(encoded_key, 8), &value); | 
 | 354 |       return value; | 
 | 355 |     } | 
 | 356 |  | 
 | 357 |     virtual void inc(const std::string& key) { | 
 | 358 |       db_->Merge(merge_option_, key, _one_slice); | 
 | 359 |     } | 
 | 360 |  | 
 | 361 |     void inc(const uint64_t key) { | 
 | 362 |       char encoded_key[sizeof(uint64_t)]; | 
 | 363 |       EncodeFixed64(encoded_key, key); | 
 | 364 |       db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice); | 
 | 365 |     } | 
 | 366 |  | 
 | 367 |     virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist); | 
| Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 368 |     void dump(uint32_t w1, uint32_t w2, int8_t dist); | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 369 |     vector<Collocator> get_collocators(uint32_t w1); | 
| Marc Kupietz | bd96619 | 2018-10-13 14:14:37 +0200 | [diff] [blame] | 370 |     vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2); | 
| Marc Kupietz | 3400aa5 | 2018-06-05 10:28:55 +0200 | [diff] [blame] | 371 |     void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur); | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 372 |     string collocators2json(vector<Collocator> collocators); | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 373 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 374 |     // mapped to a rocksdb Merge operation | 
 | 375 |     virtual bool add(const std::string& key, uint64_t value) { | 
 | 376 |       char encoded[sizeof(uint64_t)]; | 
 | 377 |       EncodeFixed64(encoded, value); | 
 | 378 |       Slice slice(encoded, sizeof(uint64_t)); | 
 | 379 |       auto s = db_->Merge(merge_option_, key, slice); | 
 | 380 |  | 
 | 381 |       if (s.ok()) { | 
 | 382 |         return true; | 
 | 383 |       } else { | 
 | 384 |         std::cerr << s.ToString() << std::endl; | 
 | 385 |         return false; | 
 | 386 |       } | 
 | 387 |     } | 
 | 388 |  | 
 | 389 |     CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist); | 
 | 390 |   }; | 
 | 391 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 392 |   rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) { | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 393 | 		//		merge_option_.sync = true; | 
| Marc Kupietz | 6bb2776 | 2018-01-09 17:53:01 +0100 | [diff] [blame] | 394 |     if(read_only) | 
 | 395 |       db_ = OpenDbForRead(db_name); | 
 | 396 |     else | 
 | 397 |       db_ = OpenDb(db_name); | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 398 |     assert(db_); | 
 | 399 |     uint64_t one = 1; | 
 | 400 |     EncodeFixed64(_one, one); | 
 | 401 |     _one_slice = Slice(_one, sizeof(uint64_t)); | 
 | 402 |   } | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 403 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 404 |   void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) { | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 405 |     inc(encodeCollocation(w1, w2, dist)); | 
 | 406 |   } | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 407 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 408 |   void rocksdb::CollocatorDB::read_vocab(string fname) { | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 409 |     char strbuf[2048]; | 
 | 410 |     uint64_t freq; | 
 | 411 |     FILE *fin = fopen(fname.c_str(), "rb"); | 
 | 412 |     if (fin == NULL) { | 
 | 413 |       cout <<  "Vocabulary file " << fname <<" not found\n"; | 
 | 414 |       exit(1); | 
 | 415 |     } | 
 | 416 |     uint64_t i = 0; | 
 | 417 |     while(!feof(fin)) { | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 418 |       fscanf(fin, "%s %lu", strbuf, &freq); | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 419 |       _vocab.push_back({strbuf, freq}); | 
 | 420 |       total += freq; | 
 | 421 |       i++; | 
 | 422 |     } | 
 | 423 |     fclose(fin); | 
| Marc Kupietz | 4ec51c1 | 2019-01-21 11:06:39 +0100 | [diff] [blame] | 424 |  | 
 | 425 |     char size_fname[256]; | 
 | 426 |     strcpy(size_fname, fname.c_str()); | 
 | 427 |     char *pos = strstr(size_fname, ".vocab"); | 
 | 428 |     if(pos) { | 
 | 429 |       *pos=0; | 
 | 430 |       strcat(size_fname, ".size"); | 
 | 431 |       FILE *fp = fopen(size_fname, "r"); | 
 | 432 |       if (fp != NULL) { | 
 | 433 |         fscanf(fp, "%lu", &sentences); | 
 | 434 |         fscanf(fp, "%lu", &total); | 
 | 435 |         float sl = (float)total/(float)sentences; | 
 | 436 |         float w = WINDOW_SIZE; | 
 | 437 |         avg_window_size = ((sl > 2*w? (sl-2*w)*2*w: 0) + (double) w * (3*w -1)) / sl; | 
 | 438 |         fprintf(stdout, "Size corrections found: corpus size: %lu tokens in %lu sentences, avg. sentence size: %f, avg. window size: %f\n", total, sentences, sl, avg_window_size); | 
 | 439 |         fclose(fp); | 
 | 440 |       } else { | 
 | 441 |         std::cout <<  "size file " << size_fname << " not found\n"; | 
 | 442 |       } | 
 | 443 |     } else { | 
 | 444 |       std::cout <<  "cannot determine size file " << size_fname << "\n"; | 
 | 445 |     } | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 446 |   } | 
 | 447 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 448 |   std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) { | 
| Marc Kupietz | 6bb2776 | 2018-01-09 17:53:01 +0100 | [diff] [blame] | 449 | 		DB* db; | 
 | 450 | 		Options options; | 
| Marc Kupietz | 0dd86ef | 2018-01-11 22:23:17 +0100 | [diff] [blame] | 451 | 		options.env->SetBackgroundThreads(4); | 
 | 452 | 		options.create_if_missing = true; | 
 | 453 | 		options.merge_operator = std::make_shared<CountMergeOperator>(); | 
 | 454 | 		options.max_successive_merges = 0; | 
 | 455 |     //		options.prefix_extractor.reset(NewFixedPrefixTransform(8)); | 
 | 456 | 		options.IncreaseParallelism(); | 
 | 457 |     options.OptimizeLevelStyleCompaction(); | 
 | 458 |     options.prefix_extractor.reset(NewFixedPrefixTransform(3)); | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 459 |     ostringstream dbname, vocabname; | 
| Marc Kupietz | 6bb2776 | 2018-01-09 17:53:01 +0100 | [diff] [blame] | 460 |     dbname << name << ".rocksdb"; | 
 | 461 | 		auto s = DB::OpenForReadOnly(options, dbname.str(), &db); | 
 | 462 | 		if (!s.ok()) { | 
 | 463 | 			std::cerr << s.ToString() << std::endl; | 
 | 464 | 			assert(false); | 
 | 465 | 		} | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 466 |     vocabname << name << ".vocab"; | 
 | 467 |     read_vocab(vocabname.str()); | 
| Marc Kupietz | 6bb2776 | 2018-01-09 17:53:01 +0100 | [diff] [blame] | 468 | 		return std::shared_ptr<DB>(db); | 
 | 469 |   } | 
 | 470 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 471 |   std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) { | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 472 | 		DB* db; | 
 | 473 | 		Options options; | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 474 |  | 
 | 475 |  | 
 | 476 | 		options.env->SetBackgroundThreads(4); | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 477 | 		options.create_if_missing = true; | 
 | 478 | 		options.merge_operator = std::make_shared<CountMergeOperator>(); | 
 | 479 | 		options.max_successive_merges = 0; | 
| Marc Kupietz | 0dd86ef | 2018-01-11 22:23:17 +0100 | [diff] [blame] | 480 |     //		options.prefix_extractor.reset(NewFixedPrefixTransform(8)); | 
 | 481 | 		options.IncreaseParallelism(); | 
 | 482 |     options.OptimizeLevelStyleCompaction(); | 
 | 483 |     // options.max_write_buffer_number = 48; | 
 | 484 |     // options.max_background_jobs = 48; | 
 | 485 |     // options.allow_concurrent_memtable_write=true; | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 486 | 		//		options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000)); | 
 | 487 | 		// options.enable_write_thread_adaptive_yield = 1; | 
 | 488 | 		// options.allow_concurrent_memtable_write = 1; | 
 | 489 | 		// options.memtable_factory.reset(new rocksdb::SkipListFactory); | 
 | 490 | 		// options.write_buffer_size = 1 << 22; | 
 | 491 | 		// options.allow_mmap_reads = true; | 
 | 492 | 		// options.allow_mmap_writes = true; | 
| Marc Kupietz | 0dd86ef | 2018-01-11 22:23:17 +0100 | [diff] [blame] | 493 | 		// options.max_background_compactions = 40; | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 494 |     // BlockBasedTableOptions table_options; | 
 | 495 |     // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false)); | 
 | 496 | 		// options.bloom_locality = 1; | 
 | 497 |     // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024); | 
 | 498 |     // table_options.block_cache = cache; | 
 | 499 | 		// options.table_factory.reset(NewBlockBasedTableFactory(table_options)); | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 500 | 		Status s; | 
 | 501 | 		//  DestroyDB(dbname, Options()); | 
 | 502 | 		s = DB::Open(options, dbname, &db); | 
 | 503 | 		if (!s.ok()) { | 
 | 504 | 			std::cerr << s.ToString() << std::endl; | 
 | 505 | 			assert(false); | 
 | 506 | 		} | 
 | 507 | 		return std::shared_ptr<DB>(db); | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 508 |   } | 
 | 509 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 510 |   CollocatorIterator* rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) { | 
| Marc Kupietz | 18375e1 | 2017-12-24 10:11:18 +0100 | [diff] [blame] | 511 |     ReadOptions options; | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 512 |     options.prefix_same_as_start = true; | 
| Marc Kupietz | 18375e1 | 2017-12-24 10:11:18 +0100 | [diff] [blame] | 513 |     char prefixc[sizeof(uint64_t)]; | 
 | 514 |     EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist)); | 
 | 515 |     Iterator *it = db_->NewIterator(options); | 
 | 516 |     CollocatorIterator *cit = new CollocatorIterator(it); | 
 | 517 |     cit->Seek(std::string(prefixc,3));// it->Valid() && it->key().starts_with(std::string(prefixc,3)); it->Next()) { | 
 | 518 |     cit->setPrefix(prefixc); | 
 | 519 |     return cit; | 
 | 520 |   } | 
 | 521 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 522 | 	void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) { | 
| Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 523 |     auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist)); | 
 | 524 |     for (; it->isValid(); it->Next()) { | 
 | 525 |       uint64_t value = it->intValue(); | 
 | 526 |       uint64_t key = it->intKey(); | 
 | 527 |       std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value << std::endl; | 
 | 528 |     } | 
 | 529 |     std::cout << "ready dumping\n"; | 
 | 530 |   } | 
 | 531 |  | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 532 | 	bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; } | 
 | 533 | 	bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; } | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 534 | 	bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; } | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 535 |  | 
| Marc Kupietz | bd96619 | 2018-10-13 14:14:37 +0200 | [diff] [blame] | 536 | 	std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) { | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 537 | 		std::vector<Collocator> collocators; | 
 | 538 |     uint64_t w2, last_w2 = 0xffffffffffffffff; | 
| Marc Kupietz | 98cbcdc | 2019-01-21 17:11:27 +0100 | [diff] [blame] | 539 |     uint64_t maxv = 0, sum = 0, left = 0, right = 0; | 
 | 540 |  | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 541 |     for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) { | 
 | 542 |       uint64_t value = it->intValue(), | 
 | 543 |         key = it->intKey(); | 
| Marc Kupietz | bd96619 | 2018-10-13 14:14:37 +0200 | [diff] [blame] | 544 |       if((w2 = W2(key)) > max_w2) | 
 | 545 |         continue; | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 546 |       if(last_w2 == 0xffffffffffffffff) last_w2 = w2; | 
 | 547 |       if (w2 != last_w2) { | 
| Marc Kupietz | 98cbcdc | 2019-01-21 17:11:27 +0100 | [diff] [blame] | 548 |         if(sum >= FREQUENCY_THRESHOLD) { | 
 | 549 |           double o = sum, | 
 | 550 |             r1 = (double)_vocab[w1].freq * avg_window_size, | 
 | 551 |             c1 = (double)_vocab[last_w2].freq, | 
 | 552 |             e = r1 * c1 / total, | 
 | 553 |             pmi = log2(o/e), | 
 | 554 |             md = log2(o*o/e), | 
 | 555 |             lfmd = log2(o*o*o/e), | 
 | 556 |             llr = ca_ll((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size); | 
 | 557 |           double left_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1); | 
 | 558 |           double right_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1); | 
 | 559 |           double left_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1); | 
 | 560 |           double right_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1); | 
| Marc Kupietz | 4188045 | 2019-01-22 15:29:06 +0100 | [diff] [blame^] | 561 |           collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(o/total/avg_window_size)), /* normalize to [-1,1] */ | 
| Marc Kupietz | 98cbcdc | 2019-01-21 17:11:27 +0100 | [diff] [blame] | 562 |                 llr, lfmd, md, | 
 | 563 |                 left_lfmd, | 
 | 564 |                 right_lfmd, | 
 | 565 |                 left_npmi, | 
| Marc Kupietz | 4188045 | 2019-01-22 15:29:06 +0100 | [diff] [blame^] | 566 |                 right_npmi, | 
 | 567 |                 ca_dice((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size), | 
 | 568 |                 ca_logdice((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size) | 
 | 569 |                 } | 
| Marc Kupietz | 98cbcdc | 2019-01-21 17:11:27 +0100 | [diff] [blame] | 570 |             ); | 
 | 571 |         } | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 572 |         last_w2 = w2; | 
| Marc Kupietz | 8e0ebea | 2018-01-24 09:53:26 +0100 | [diff] [blame] | 573 |         maxv = value; | 
| Marc Kupietz | 98cbcdc | 2019-01-21 17:11:27 +0100 | [diff] [blame] | 574 |         sum = value; | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 575 |       } else { | 
| Marc Kupietz | 98cbcdc | 2019-01-21 17:11:27 +0100 | [diff] [blame] | 576 |         sum += value; | 
| Marc Kupietz | 8e0ebea | 2018-01-24 09:53:26 +0100 | [diff] [blame] | 577 |         if(value > maxv) | 
 | 578 |           maxv = value; | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 579 |       } | 
| Marc Kupietz | 8e0ebea | 2018-01-24 09:53:26 +0100 | [diff] [blame] | 580 |       if(DIST(key) == -1) | 
 | 581 |         left = value; | 
 | 582 |       else if(DIST(key) == 1) | 
 | 583 |         right = value; | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 584 |     } | 
 | 585 |  | 
 | 586 | 		sort(collocators.begin(), collocators.end(), sortByLfmd); | 
 | 587 | 		 | 
| Marc Kupietz | 0779a20 | 2018-06-05 11:13:35 +0200 | [diff] [blame] | 588 |     /* | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 589 |     int i=0; | 
 | 590 |     for (Collocator c : collocators) { | 
 | 591 |       if(i++>10) break; | 
 | 592 |       std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word | 
 | 593 |                 << "\t f(w1):" << _vocab[w1].freq | 
 | 594 |                 << "\t f(w2):" << _vocab[c.w2].freq | 
 | 595 |                 << "\t f(w1, x):" << total_w1 | 
| Marc Kupietz | 51f9379 | 2018-01-25 08:51:01 +0100 | [diff] [blame] | 596 |                 << "\t f(w1, w2):" << c.raw | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 597 |                 << "\t pmi:" << c.pmi | 
 | 598 |                 << "\t npmi:" << c.npmi | 
 | 599 |                 << "\t llr:" << c.llr | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 600 |                 << "\t lfmd:" << c.lfmd | 
 | 601 |                 << "\t fpmi:" << c.fpmi | 
 | 602 |                 << "\t total:" << total | 
 | 603 |                 << std::endl; | 
 | 604 |     } | 
| Marc Kupietz | 0779a20 | 2018-06-05 11:13:35 +0200 | [diff] [blame] | 605 |     */ | 
| Marc Kupietz | d31254c | 2018-01-20 21:29:30 +0100 | [diff] [blame] | 606 | 		return collocators; | 
 | 607 |   } | 
 | 608 |  | 
| Marc Kupietz | bd96619 | 2018-10-13 14:14:37 +0200 | [diff] [blame] | 609 | 	std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) { | 
 | 610 |     return get_collocators(w1, UINT32_MAX); | 
 | 611 |   } | 
 | 612 |  | 
| Marc Kupietz | 3400aa5 | 2018-06-05 10:28:55 +0200 | [diff] [blame] | 613 |   void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) { | 
 | 614 | 		std::vector<Collocator> collocators; | 
 | 615 |     std::stringstream stream; | 
 | 616 |     uint64_t w2, last_w2 = 0xffffffffffffffff; | 
 | 617 |     uint64_t maxv = 0, total_w1 = 0; | 
 | 618 |     bool first = true; | 
 | 619 |     for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) { | 
 | 620 |       uint64_t value = it->intValue(), | 
 | 621 |         key = it->intKey(); | 
 | 622 |       w2 = W2(key); | 
 | 623 |       total_w1 += value; | 
 | 624 |       if(last_w2 == 0xffffffffffffffff) last_w2 = w2; | 
 | 625 |       if (w2 != last_w2) { | 
 | 626 |         if(maxv >= min_cooccur) { | 
| Marc Kupietz | bbd236e | 2019-01-21 16:50:19 +0100 | [diff] [blame] | 627 |           double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq,  maxv, total, 1); | 
| Marc Kupietz | 3400aa5 | 2018-06-05 10:28:55 +0200 | [diff] [blame] | 628 |           if(first) | 
 | 629 |             first = false; | 
 | 630 |           else | 
 | 631 |            stream << " "; | 
 | 632 |           stream << w2  << " " << llr; | 
 | 633 |         } | 
 | 634 |         last_w2 = w2; | 
 | 635 |         maxv = value; | 
 | 636 |       } else { | 
 | 637 |         if(value > maxv) | 
 | 638 |           maxv = value; | 
 | 639 |       } | 
 | 640 |     } | 
 | 641 |     if(first) | 
 | 642 |       stream  << "1 0.0"; | 
 | 643 |     stream  << "\n"; | 
 | 644 |     std::cout << stream.str(); | 
 | 645 |   } | 
 | 646 |  | 
| Marc Kupietz | 4b799e9 | 2018-01-02 11:04:56 +0100 | [diff] [blame] | 647 |   rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); } | 
 | 648 |   rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); } | 
 | 649 |   rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); } | 
 | 650 |  | 
| Marc Kupietz | 28cc53e | 2017-12-23 17:24:55 +0100 | [diff] [blame] | 651 | }; | 
| Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 652 |  | 
| Marc Kupietz | 4a5e08a | 2018-06-05 11:07:11 +0200 | [diff] [blame] | 653 | string rocksdb::CollocatorDB::getWord(uint32_t w1) { | 
 | 654 |   return _vocab[w1].word; | 
 | 655 | } | 
 | 656 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 657 | string rocksdb::CollocatorDB::collocators2json(vector<Collocator> collocators) { | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 658 |   ostringstream s; | 
| Marc Kupietz | 0dd86ef | 2018-01-11 22:23:17 +0100 | [diff] [blame] | 659 |   int i = 0; | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 660 |   s << "["; | 
 | 661 |   bool first = true; | 
 | 662 |   for (Collocator c : collocators) { | 
| Marc Kupietz | b999ec5 | 2018-06-05 11:20:46 +0200 | [diff] [blame] | 663 |     if(strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0) continue; | 
| Marc Kupietz | 0dd86ef | 2018-01-11 22:23:17 +0100 | [diff] [blame] | 664 |     if (i++ > 200) | 
 | 665 |       break; | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 666 |     if(!first) | 
 | 667 |       s << ",\n"; | 
 | 668 |     else | 
 | 669 |       first = false; | 
 | 670 |     s << "{" | 
 | 671 |       "\"word\":\"" << string(_vocab[c.w2].word) << "\"," << | 
 | 672 |       "\"rank\":" << c.w2    << "," << | 
| Marc Kupietz | 51f9379 | 2018-01-25 08:51:01 +0100 | [diff] [blame] | 673 |       "\"f\":" << c.raw    << "," << | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 674 |       "\"npmi\":" << c.npmi  << "," << | 
| Marc Kupietz | 4188045 | 2019-01-22 15:29:06 +0100 | [diff] [blame^] | 675 |       "\"pmi\":" << c.pmi  << "," << | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 676 |       "\"llr\":" << c.llr   << "," << | 
 | 677 |       "\"lfmd\":" << c.lfmd  << "," << | 
| Marc Kupietz | 4188045 | 2019-01-22 15:29:06 +0100 | [diff] [blame^] | 678 |       "\"md\":" << c.md  << "," << | 
 | 679 |       "\"dice\":" << c.dice  << "," << | 
 | 680 |       "\"ld\":" << c.logdice  << "," << | 
| Marc Kupietz | 8e0ebea | 2018-01-24 09:53:26 +0100 | [diff] [blame] | 681 |       "\"llfmd\":" << c.left_lfmd  << "," << | 
 | 682 |       "\"rlfmd\":" << c.right_lfmd  << "," << | 
 | 683 |       "\"lnpmi\":" << c.left_npmi  << "," << | 
 | 684 |       "\"rnpmi\":" << c.right_npmi  << | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 685 |       "}"; | 
 | 686 |   } | 
 | 687 |   s << "]\n"; | 
| Marc Kupietz | 8e0ebea | 2018-01-24 09:53:26 +0100 | [diff] [blame] | 688 |   //  cout << s.str(); | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 689 |   return s.str(); | 
 | 690 | } | 
 | 691 |  | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 692 | typedef rocksdb::CollocatorDB COLLOCATORS; | 
| Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 693 |  | 
 | 694 | extern "C" { | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 695 | 	COLLOCATORS *open_collocatordb_for_write(char *dbname) { | 
 | 696 | 		return new rocksdb::CollocatorDB(dbname, false); | 
| Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 697 | 	} | 
 | 698 | 	 | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 699 | 	COLLOCATORS *open_collocatordb(char *dbname) { | 
 | 700 | 		return new rocksdb::CollocatorDB(dbname, true); | 
| Marc Kupietz | 6bb2776 | 2018-01-09 17:53:01 +0100 | [diff] [blame] | 701 | 	} | 
 | 702 | 	 | 
| Marc Kupietz | 6aec768 | 2018-01-10 09:47:48 +0100 | [diff] [blame] | 703 | 	void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) { | 
| Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 704 | 		db->inc(w1, w2, dist); | 
 | 705 | 	} | 
 | 706 |  | 
 | 707 | 	void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) { | 
 | 708 | 		db->dump(w1, w2, dist); | 
 | 709 | 	} | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 710 |  | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 711 | 	void get_collocators(COLLOCATORS *db, uint32_t w1) { | 
 | 712 | 		db->get_collocators(w1); | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 713 | 	} | 
 | 714 |  | 
| Marc Kupietz | ca3a52e | 2018-06-05 14:16:23 +0200 | [diff] [blame] | 715 | 	const char *get_word(COLLOCATORS *db, uint32_t w) { | 
 | 716 | 		return db->getWord(w).c_str(); | 
 | 717 | 	} | 
 | 718 |  | 
| Marc Kupietz | 37359b1 | 2018-01-09 21:11:37 +0100 | [diff] [blame] | 719 | 	const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) { | 
 | 720 | 		return strdup(db->collocators2json(db->get_collocators(w1)).c_str()); | 
| Marc Kupietz | c8ddf45 | 2018-01-07 21:33:12 +0100 | [diff] [blame] | 721 | 	} | 
| Marc Kupietz | 06c9a9f | 2018-01-02 16:56:43 +0100 | [diff] [blame] | 722 | } |