collocatordb: factor out CA measure calculation into applyCAMeasures() to test parallelization
diff --git a/collocatordb.cc b/collocatordb.cc
index 9245304..34fb88b 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -46,7 +46,7 @@
namespace rocksdb {
class Collocator {
public:
- uint64_t w2;
+ uint32_t w2;
uint64_t f2;
uint64_t raw;
double pmi;
@@ -392,6 +392,8 @@
void dump(uint32_t w1, uint32_t w2, int8_t dist);
vector<Collocator> get_collocators(uint32_t w1);
vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
+ void applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow, const uint64_t sum, const int usedPositions, int true_window_size, rocksdb::Collocator *result);
+
void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
string collocators2json(vector<Collocator> collocators);
@@ -558,14 +560,67 @@
bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) { return lhs.logdice > rhs.logdice; }
+ // Computes the association measures for the pair (w1, w2), searches the subsets of usedPositions for the best logDice, and writes everything to *result.
+ void rocksdb::CollocatorDB::applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow, // sumWindow[pos]: count for window position pos; read for pos in [0, 2*WINDOW_SIZE)
+ const uint64_t sum, const int usedPositions, int true_window_size, rocksdb::Collocator *result) { // usedPositions: bitmask of window positions; result: output object, fully overwritten
+ uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
+ double o = sum, // the pair count passed in as sum
+ r1 = f1 * true_window_size, // NOTE(review): product is formed in uint64_t here; the removed inline code cast the frequency to double first
+ c1 = f2,
+ e = r1 * c1 / total, // `total` is defined outside this function
+ pmi = log2(o/e),
+ md = log2(o*o/e),
+ lfmd = log2(o*o*o/e),
+ llr = ca_ll(f1, f2, sum, total, true_window_size);
+ double ld = ca_logdice(f1, f2, sum, total, true_window_size); // logDice over the whole window; baseline for the search below
+ // Subset search: start from the full set of used positions (score ld) and try every non-empty subset of usedPositions.
+ int bestWindow = usedPositions;
+ double bestAF = ld;
+ double currentAF; // NOTE(review): declared outside the loop, so it would be shared between threads if the disabled pragma below were enabled
+ // if(f1<75000000)   <- guard used by the removed inline code; disabled here
+ //#pragma omp parallel for reduction(max:bestAF)
+ for (int bitmask=1; bitmask < (1 << (2*WINDOW_SIZE)); bitmask++) { // visits all 2^(2*WINDOW_SIZE)-1 non-zero masks
+ if((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0) continue; // keep only non-empty subsets of usedPositions
+ uint64_t currentWindowSum=0;
+ //#pragma omp parallel for reduction(+:currentWindowSum)
+ for (int pos=0; pos < 2*WINDOW_SIZE; pos++) {
+ if (((1<<pos) & bitmask & usedPositions) != 0) // position pos is selected by this mask
+ currentWindowSum+=sumWindow[pos];
+ }
+ currentAF = ca_logdice(f1, f2, currentWindowSum, total, __builtin_popcount(bitmask)); // window size passed = number of positions in the mask
+ if(currentAF > bestAF) { // strict '>': on ties the earlier candidate is kept
+ bestAF = currentAF;
+ bestWindow = bitmask;
+ }
+ }
+ // The initializer below keeps the value order of the removed inline push_back in get_collocators.
+ *result = {w2, f2, sum,
+ pmi, pmi / (-log2(o/total/true_window_size)), // second value: annotated "normalize to [-1,1]" in the removed inline code
+ llr, lfmd, md,
+ 0, // left_lfmd in the removed inline code; no longer computed
+ 0, // right_lfmd in the removed inline code; no longer computed
+ 0, // left_npmi in the removed inline code; no longer computed
+ 0, // right_npmi in the removed inline code; no longer computed
+ ca_dice(f1, f2, sum, total, true_window_size),
+ ld,
+ bestAF, // best logDice found by the subset search
+ usedPositions,
+ bestWindow // bitmask of the subset that produced bestAF
+ };
+
+ }
+
std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
std::vector<Collocator> collocators;
uint64_t w2, last_w2 = 0xffffffffffffffff;
- uint64_t maxv = 0, sum = 0, left = 0, right = 0;
- uint64_t sumWindow[2*WINDOW_SIZE+1] = {};
+ uint64_t maxv = 0, sum = 0;
+ uint64_t *sumWindow = (uint64_t*) malloc(sizeof(uint64_t)*2*WINDOW_SIZE); // NOTE(review): no free() for this buffer is visible in this patch — confirm it is released; the replaced stack array had 2*WINDOW_SIZE+1 elements
+ memset(sumWindow, 0, sizeof(uint64_t)*2*WINDOW_SIZE); // zero all 2*WINDOW_SIZE counters
int true_window_size = 1;
int usedPositions=0;
+#pragma omp parallel num_threads(40)
+#pragma omp single
for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
uint64_t value = it->intValue(),
key = it->intKey();
@@ -574,52 +629,15 @@
if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
if (w2 != last_w2) {
if (sum >= FREQUENCY_THRESHOLD) {
- uint64_t f1 = _vocab[w1].freq, f2 = _vocab[last_w2].freq;
- double o = sum,
- r1 = (double)_vocab[w1].freq * true_window_size,
- c1 = (double)_vocab[last_w2].freq,
- e = r1 * c1 / total,
- pmi = log2(o/e),
- md = log2(o*o/e),
- lfmd = log2(o*o*o/e),
- llr = ca_ll(f1, f2, sum, total, true_window_size);
- double left_lfmd = ca_lfmd(f1, f2, left, total, 1);
- double right_lfmd = ca_lfmd(f1, f2, right, total, 1);
- double left_npmi = ca_npmi(f1, f2, left, total, 1);
- double right_npmi = ca_npmi(f1, f2, right, total, 1);
- double ld = ca_logdice(f1, f2, sum, total, true_window_size);
-
- int bestWindow = usedPositions;
- double bestAF = ld;
- double currentAF;
- if(f1<75000000)
- for (int bitmask=1; bitmask < (1 << (2*WINDOW_SIZE)); bitmask++) {
- if((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0) continue;
- uint64_t currentWindowSum=0;
- for (int pos=0; pos < 2*WINDOW_SIZE; pos++) {
- if (((1<<pos) & bitmask & usedPositions) != 0)
- currentWindowSum+=sumWindow[pos];
- }
- currentAF = ca_logdice(f1, f2, currentWindowSum, total, __builtin_popcount(bitmask));
- if(currentAF > bestAF) {
- bestAF = currentAF;
- bestWindow = bitmask;
- }
+ collocators.push_back({}); // append an empty slot that the task below fills in
+ rocksdb::Collocator *result = &(collocators[collocators.size()-1]); // NOTE(review): a later push_back may reallocate the vector and invalidate this pointer while a deferred task still uses it
+#pragma omp task firstprivate(last_w2, sumWindow, sum, usedPositions, true_window_size) shared(w1, result) if(sum > 1000000)
+ { // NOTE(review): firstprivate(sumWindow) copies only the pointer; the producer memsets the same buffer after this block, possibly while a deferred task reads it
+ // uint64_t *nsw = (uint64_t *)malloc(sizeof(uint64_t) * 2 *WINDOW_SIZE);
+ // memcpy(nsw, sumWindow, sizeof(uint64_t) * 2 *WINDOW_SIZE);
+ applyCAMeasures(w1, last_w2, sumWindow, sum, usedPositions, true_window_size, result); // NOTE(review): `result` is a block-local listed as shared; a deferred task may read it after its scope has ended — firstprivate(result) would capture the value
+ // free(nsw);
}
- collocators.push_back ( {last_w2, f2, sum,
- pmi, pmi / (-log2(o/total/true_window_size)), /* normalize to [-1,1] */
- llr, lfmd, md,
- left_lfmd,
- right_lfmd,
- left_npmi,
- right_npmi,
- ca_dice(f1, f2, sum, total, true_window_size),
- ld,
- bestAF,
- usedPositions,
- bestWindow
- }
- );
}
memset(sumWindow, 0, 2*WINDOW_SIZE * sizeof(uint64_t));
usedPositions = 1 << (-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0));
@@ -636,36 +654,31 @@
sumWindow[-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0)] = value;
true_window_size++;
}
- if(DIST(key) == -1)
- left = value;
- else if(DIST(key) == 1)
- right = value;
}
- sort(collocators.begin(), collocators.end(), sortByLogDice);
-
- /*
+#pragma omp taskwait
+ sort(collocators.begin(), collocators.end(), sortByLogDice);
+
int i=0;
for (Collocator c : collocators) {
if(i++>10) break;
- std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
+ std::cout << "w1:" << _vocab[w1].word << ", w2: *" << _vocab[c.w2].word << "*"
<< "\t f(w1):" << _vocab[w1].freq
<< "\t f(w2):" << _vocab[c.w2].freq
- << "\t f(w1, x):" << total_w1
<< "\t f(w1, w2):" << c.raw
<< "\t pmi:" << c.pmi
<< "\t npmi:" << c.npmi
<< "\t llr:" << c.llr
+ << "\t md:" << c.md
<< "\t lfmd:" << c.lfmd
- << "\t fpmi:" << c.fpmi
<< "\t total:" << total
<< std::endl;
}
- */
- return collocators;
+
+ return collocators;
}
- std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
+ std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
return get_collocators(w1, UINT32_MAX);
}
@@ -747,7 +760,7 @@
"}";
}
s << "]\n";
- // cout << s.str();
+ std::cout << s.str();
return s.str();
}