collocatordb: use max resp. instead of avg. window for ca
diff --git a/collocatordb.cc b/collocatordb.cc
index 9ecbf63..7aca8a5 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -11,7 +11,7 @@
#include <string>
#include <sstream> // for ostringstream
#include <math.h>
-#include "rocksdb/cache.h"
+#include <rocksdb/cache.h>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
@@ -179,6 +179,7 @@
bool rocksdb::CollocatorIterator::isValid() {
return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
+ // Disabled alternative (omits the base_iterator_->Valid() check): return key().starts_with(std::string(prefixc,3));
}
uint64_t rocksdb::CollocatorIterator::intKey() {
@@ -300,6 +301,7 @@
virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
void dump(uint32_t w1, uint32_t w2, int8_t dist);
vector<Collocator> get_collocators(uint32_t w1);
+ vector<Collocator> get_collocators_avg(uint32_t w1);
string collocators2json(vector<Collocator> collocators);
// mapped to a rocksdb Merge operation
@@ -346,7 +348,7 @@
}
uint64_t i = 0;
while(!feof(fin)) {
- fscanf(fin, "%s %" PRIu64, strbuf, &freq);
+ fscanf(fin, "%s %lu", strbuf, &freq);
_vocab.push_back({strbuf, freq});
total += freq;
i++;
@@ -482,8 +484,9 @@
bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
+ // Ordering predicate for sort(): the collocator with the larger llr comes first.
+ bool sortByLlr(const Collocator &lhs, const Collocator &rhs) {
+   return rhs.llr < lhs.llr;
+ }
- std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
+ std::vector<Collocator> rocksdb::CollocatorDB::get_collocators_avg(uint32_t w1) {
std::vector<Collocator> collocators;
uint64_t w2, last_w2 = 0xffffffffffffffff;
uint64_t sum = 0, total_w1 = 0;
@@ -530,6 +533,56 @@
return collocators;
}
+ // Returns the collocates of w1 together with their association scores
+ // (pmi, npmi, llr, md, lfmd, fpmi), sorted by descending lfmd.
+ // For each w2 the co-occurrence count used for scoring is the largest single
+ // value among that w2's consecutive iterator entries. The function previously
+ // named get_collocators is kept as get_collocators_avg.
+ // Side effect: prints the leading entries of the sorted result to stdout.
+ // NOTE(review): assumes the iterator yields all entries of one w2
+ // consecutively -- confirm against the key layout / comparator.
+ std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
+   const uint64_t no_w2 = 0xffffffffffffffff;  // sentinel: no entry seen yet
+   std::vector<Collocator> collocators;
+   uint64_t last_w2 = no_w2;
+   uint64_t max = 0, total_w1 = 0;
+   const double window_size = 1;
+
+   // Scores the group that just ended (identified by last_w2, with its maximum
+   // value in max) and appends the result to collocators.
+   auto emit_group = [&]() {
+     double pmi = log2( total * ((double) max) /
+                        (window_size * ((double)_vocab[w1].freq) * ((double)_vocab[last_w2].freq) ));
+     // Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics. In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
+     // double md = log2(pow((double)max * window_size / total, 2) / (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
+     double md = log2((double)max * max / ((double) total * window_size * window_size * _vocab[w1].freq * _vocab[last_w2].freq));
+     collocators.push_back ( {last_w2, max, pmi, pmi / (-log2(((double) max / window_size / total))), /* normalize to [-1,1] */
+                              calculateLLR(_vocab[w1].freq, total, max, _vocab[last_w2].freq), md, md + log2((double)max / window_size / total), pmi*max/total/window_size} );
+   };
+
+   for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
+     const uint64_t value = it->intValue();
+     const uint64_t key = it->intKey();
+     const uint64_t w2 = W2(key);
+     total_w1 += value;
+     if (last_w2 == no_w2) last_w2 = w2;
+     if (w2 != last_w2) {
+       // A new w2 starts: finish the previous group, then start tracking this one.
+       emit_group();
+       last_w2 = w2;
+       max = value;
+     } else if (value > max) {
+       max = value;
+     }
+   }
+   // Inside the loop a group is only emitted once the next w2 shows up, so the
+   // final group must be emitted here; without this the last collocate is lost.
+   // The guard skips the case where the iterator produced no entries at all.
+   if (last_w2 != no_w2) emit_group();
+
+   sort(collocators.begin(), collocators.end(), sortByLfmd);
+
+   // Debug listing of the top of the ranking; the post-increment test lets
+   // indices 0..10 through, i.e. at most 11 entries are printed.
+   int i=0;
+   for (const Collocator &c : collocators) {
+     if(i++>10) break;
+     std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
+               << "\t f(w1):" << _vocab[w1].freq
+               << "\t f(w2):" << _vocab[c.w2].freq
+               << "\t f(w1, x):" << total_w1
+               << "\t f(w1, w2):" << c.sum
+               << "\t pmi:" << c.pmi
+               << "\t npmi:" << c.npmi
+               << "\t llr:" << c.llr
+               << "\t md:" << c.md
+               << "\t lfmd:" << c.lfmd
+               << "\t fpmi:" << c.fpmi
+               << "\t total:" << total
+               << std::endl;
+   }
+   return collocators;
+ }
+
rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }