collocatordb: add support for offline similar profiles calculation
./dumpllr ../Analysemethoden/word2vec/models/dereko-2017-ii > dereko.llr
python3 ccdbknn.py dereko.llr > dereko.sprofiles
diff --git a/dumpllr.cc b/dumpllr.cc
new file mode 100644
index 0000000..00158a5
--- /dev/null
+++ b/dumpllr.cc
@@ -0,0 +1,69 @@
+#include <typeinfo>
+#include <assert.h>
+#include <memory>
+#include <iostream>
+#include <stdint.h>
+#include "collocatordb.h"
+#include <thread>
+#include <chrono>
+#include <sstream> // for ostringstream
+#include <vector>
+
+using namespace rocksdb;
+
+// Dumps one line per word id in [START, STOP) to stdout: the `w2 npmi` pairs
+// returned by CollocatorDB::get_collocators(), or "0 0.0" when there are none.
+// Progress is reported on stderr.
+//
+// Usage: dumpllr <collocatordb-path>
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    std::cerr << "Usage: " << (argc > 0 ? argv[0] : "dumpllr")
+              << " <collocatordb-path>\n";
+    return 1;
+  }
+
+  // Unsigned like the loop index, so the loop condition does not mix signedness.
+  const uint32_t START = 0;
+  const uint32_t STOP = 1500000;
+
+  // NOTE(review): the `true` flag presumably means read-only -- confirm.
+  CollocatorDB cdb(argv[1], true);
+  std::cerr << "Database " << argv[1] << " opened\n";
+
+  // Iterations run in parallel; assumes get_collocators() may be called
+  // concurrently -- TODO confirm. Output order is enforced by the ordered region.
+  #pragma omp parallel for ordered schedule(static,1)
+  for (uint32_t i = START; i < STOP; i++) {
+    // cdb.dumpSparseLlr(i, 5);
+    const std::vector<rocksdb::Collocator> cs = cdb.get_collocators(i);
+    std::stringstream stream;
+    // stream << i << "(" << cdb.getWord(i) << "): ";
+    if (cs.empty())
+      stream << "0 0.0";
+    for (const rocksdb::Collocator& c : cs) {
+      stream << c.w2 << " " << c.npmi << " ";
+      // stream << c.w2 << "(" << cdb.getWord(c.w2) << ") " << c.llr << " ";
+      // Stop after the first collocator with raw < 5 (it is still written).
+      if (c.raw < 5)
+        break;
+    }
+    stream << "\n";
+
+    // Writing and progress reporting both happen inside the ordered region:
+    // the progress counter used to be a shared `done++` outside of it, which
+    // raced between threads. Iterations enter here in index order, so the
+    // number of finished rows is simply i - START + 1.
+    #pragma omp ordered
+    {
+      std::cout << stream.str();
+      if ((i - START) % 100 == 0) {
+        std::cerr << "\r\033[2K" << std::flush;
+        std::cerr << "done: " << (i - START + 1) * 100.0 / (STOP - START)
+                  << "%" << std::flush;
+      }
+    }
+  }
+  std::cout << std::flush;
+  return 0;
+}