collocatordb: use sums and a frequency threshold of 5 for CA
... i.e. sum the values over all entries of a collocate (w2) instead of taking their maximum
next todo: implement auto-focus
diff --git a/collocatordb.cc b/collocatordb.cc
index 06ba8c3..eff9c23 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -23,6 +23,7 @@
#include "merge_operators.h"
#define WINDOW_SIZE 5.0
+#define FREQUENCY_THRESHOLD 5
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
#define encodeCollocation(w1, w2, dist) (((uint64_t)dist << 56) | ((uint64_t)w2 << 24) | w1)
#define W1(key) (uint64_t)(key & 0xffffff)
@@ -516,8 +517,8 @@
std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
std::vector<Collocator> collocators;
uint64_t w2, last_w2 = 0xffffffffffffffff;
- uint64_t maxv = 0, left = 0, right = 0;
- const double window_size = 1;
+ uint64_t maxv = 0, sum = 0, left = 0, right = 0;
+
for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
uint64_t value = it->intValue(),
key = it->intKey();
@@ -525,22 +526,32 @@
continue;
if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
if (w2 != last_w2) {
- double pmi = ca_pmi(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, window_size);
- double lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, window_size);
- double left_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
- double right_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
- double left_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
- double right_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
- collocators.push_back ( {last_w2, maxv, pmi, pmi / (-log2(((double) maxv / window_size / total))), /* normalize to [-1,1] */
- calculateLLR(_vocab[w1].freq, total, maxv, _vocab[last_w2].freq), lfmd, pmi*maxv/total/window_size,
- left_lfmd,
- right_lfmd,
- left_npmi,
- right_npmi}
- );
+ if(sum >= FREQUENCY_THRESHOLD) {
+ double o = sum,
+ r1 = (double)_vocab[w1].freq * avg_window_size,
+ c1 = (double)_vocab[last_w2].freq,
+ e = r1 * c1 / total,
+ pmi = log2(o/e),
+ md = log2(o*o/e),
+ lfmd = log2(o*o*o/e),
+ llr = ca_ll((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size);
+ double left_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
+ double right_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
+ double left_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
+ double right_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
+ collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(o)), /* normalize to [-1,1] */
+ llr, lfmd, md,
+ left_lfmd,
+ right_lfmd,
+ left_npmi,
+ right_npmi}
+ );
+ }
last_w2 = w2;
maxv = value;
+ sum = value;
} else {
+ sum += value;
if(value > maxv)
maxv = value;
}