collocatordb: add support for offline calculation of similar profiles
./dumpllr ../Analysemethoden/word2vec/models/dereko-2017-ii > dereko.llr
python3 ccdbknn.py dereko.llr > dereko.sprofiles
diff --git a/collocatordb.cc b/collocatordb.cc
index ccddecb..faedad0 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -323,6 +323,7 @@
virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
void dump(uint32_t w1, uint32_t w2, int8_t dist);
vector<Collocator> get_collocators(uint32_t w1);
+ void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur); // writes one line of "w2 llr" pairs for w1 to stdout
vector<Collocator> get_collocators_avg(uint32_t w1);
string collocators2json(vector<Collocator> collocators);
@@ -612,6 +613,40 @@
return collocators;
}
+ // Writes the sparse LLR profile of w1 to stdout as one line of space-separated "w2 llr" pairs:
+ // one pair per distinct W2(key) whose largest value is >= min_cooccur, or "1 0.0" if none.
+ void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
+   const uint64_t kNoGroup = 0xffffffffffffffff;  // sentinel: no w2 group seen yet
+   std::stringstream stream;
+   uint64_t last_w2 = kNoGroup, maxv = 0;
+   bool first = true;
+   // Emits the finished last_w2 group. The pair is labelled with last_w2 (not the key that
+   // ended the group), because last_w2 is the word whose frequency and maxv went into the LLR.
+   auto flush = [&]() {
+     if (last_w2 == kNoGroup || maxv < min_cooccur) return;
+     double llr = calculateLLR(_vocab[w1].freq, total, maxv, _vocab[last_w2].freq);
+     if (!first) stream << " ";
+     first = false;
+     stream << last_w2 << " " << llr;
+   };
+   for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
+     uint64_t value = it->intValue();
+     uint64_t w2 = W2(it->intKey());
+     if (w2 != last_w2) {  // first key of a new w2 group: close the previous one
+       flush();
+       last_w2 = w2;
+       maxv = value;
+     } else if (value > maxv) {
+       maxv = value;  // keep the maximum value over all keys sharing this w2
+     }
+   }
+   flush();  // the last group has no following key to trigger its emission
+   if (first)
+     stream << "1 0.0";
+   stream << "\n";
+   std::cout << stream.str();
+ }
+
rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }