derekovecs/collocatordb: default order by Log-Dice
I agree with Adam Kilgarriff, Pavel Rychlý and Bryan Jurish. Log-Dice
typically gives a useful score, particularly for our large corpora. It
neither overemphasizes high frequency collocates (like LLR) nor low
frequency ones (like MI) and does so a little better than MI³, however
not always better than MI².
diff --git a/collocatordb.cc b/collocatordb.cc
index 2fbed98..8234739 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -552,6 +552,7 @@
bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
+ bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) { return lhs.logdice > rhs.logdice; } // highest logDice first
std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
std::vector<Collocator> collocators;
@@ -603,7 +604,7 @@
right = value;
}
- sort(collocators.begin(), collocators.end(), sortByLfmd);
+ sort(collocators.begin(), collocators.end(), sortByLogDice);
/*
int i=0;