derekovecs/collocatordb: add auto-focus to collocation analysis
Use the sub-window that yields the highest score – currently only for log-dice.
diff --git a/collocatordb.cc b/collocatordb.cc
index 320f0b7..bb505dd 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -22,7 +22,7 @@
#include "rocksdb/filter_policy.h"
#include "merge_operators.h"
-#define WINDOW_SIZE 5.0
+#define WINDOW_SIZE 5
#define FREQUENCY_THRESHOLD 5
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
#define encodeCollocation(w1, w2, dist) (((uint64_t)dist << 56) | ((uint64_t)w2 << 24) | w1)
@@ -59,6 +59,8 @@
double right_npmi;
double dice;
double logdice;
+ double af;
+ int window;
};
size_t num_merge_operator_calls;
@@ -554,10 +556,11 @@
bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) { return lhs.logdice > rhs.logdice; }
- std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
- std::vector<Collocator> collocators;
+ std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
+ std::vector<Collocator> collocators;
uint64_t w2, last_w2 = 0xffffffffffffffff;
uint64_t maxv = 0, sum = 0, left = 0, right = 0;
+ uint64_t sumWindow[2*WINDOW_SIZE+1] = {};
for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
uint64_t value = it->intValue(),
@@ -566,7 +569,8 @@
continue;
if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
if (w2 != last_w2) {
- if(sum >= FREQUENCY_THRESHOLD) {
+ if (sum >= FREQUENCY_THRESHOLD) {
+ uint64_t f1 = _vocab[w1].freq, f2 = _vocab[last_w2].freq;
double o = sum,
r1 = (double)_vocab[w1].freq * avg_window_size,
c1 = (double)_vocab[last_w2].freq,
@@ -574,22 +578,42 @@
pmi = log2(o/e),
md = log2(o*o/e),
lfmd = log2(o*o*o/e),
- llr = ca_ll((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size);
- double left_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
- double right_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
- double left_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
- double right_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
+ llr = ca_ll(f1, f2, sum, total, avg_window_size);
+ double left_lfmd = ca_lfmd(f1, f2, left, total, 1);
+ double right_lfmd = ca_lfmd(f1, f2, right, total, 1);
+ double left_npmi = ca_npmi(f1, f2, left, total, 1);
+ double right_npmi = ca_npmi(f1, f2, right, total, 1);
+
+ int bestWindow = (1 << (2*WINDOW_SIZE)) - 1;
+ double bestAF = ca_logdice(f1, f2, sum, total, 2*WINDOW_SIZE);
+ double currentAF;
+ for (int bitmask=1; bitmask < (1 << (2*WINDOW_SIZE)); bitmask++) {
+ uint64_t currentWindowSum=0;
+ for (int pos=0; pos < 2*WINDOW_SIZE; pos++) {
+ if (((1<<pos) & bitmask) != 0)
+ currentWindowSum+=sumWindow[pos];
+ }
+ currentAF = ca_logdice(f1, f2, currentWindowSum, total, __builtin_popcount(bitmask));
+ if(currentAF > bestAF) {
+ bestAF = currentAF;
+ bestWindow = bitmask;
+ }
+ }
collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(o/total/avg_window_size)), /* normalize to [-1,1] */
llr, lfmd, md,
left_lfmd,
right_lfmd,
left_npmi,
right_npmi,
- ca_dice((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size),
- ca_logdice((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size)
+ ca_dice(f1, f2, sum, total, avg_window_size),
+ ca_logdice(f1, f2, sum, total, avg_window_size),
+ bestAF,
+ bestWindow
}
);
}
+ memset(sumWindow, 0, 2*WINDOW_SIZE * sizeof(uint64_t));
+ sumWindow[-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0)] = value;
last_w2 = w2;
maxv = value;
sum = value;
@@ -597,6 +621,7 @@
sum += value;
if(value > maxv)
maxv = value;
+ sumWindow[-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0)] = value;
}
if(DIST(key) == -1)
left = value;
@@ -702,7 +727,9 @@
"\"llfmd\":" << c.left_lfmd << "," <<
"\"rlfmd\":" << c.right_lfmd << "," <<
"\"lnpmi\":" << c.left_npmi << "," <<
- "\"rnpmi\":" << c.right_npmi <<
+ "\"rnpmi\":" << c.right_npmi << "," <<
+ "\"af\":" << c.af << "," <<
+ "\"win\":" << c.window <<
"}";
}
s << "]\n";