collocatordb: add support for offline calculation of similar profiles

Usage example:
    ./dumpllr ../Analysemethoden/word2vec/models/dereko-2017-ii > dereko.llr
    python3 ccdbknn.py dereko.llr > dereko.sprofiles
diff --git a/Makefile b/Makefile
index 3464afb..b6d0227 100644
--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,9 @@
testcdb: testcdb.cc collocatordb.h collocatordb.o Makefile
$(CXX) $(CXXFLAGS) -L. -L/usr/local/lib $@.cc -o$@ collocatordb.o -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+dumpllr: dumpllr.cc collocatordb.h collocatordb.o Makefile
+	$(CXX) $(CXXFLAGS) -L. -L/usr/local/lib $@.cc -fopenmp -o$@ collocatordb.o -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
c_testcdb: c_testcdb.c collocatordb.h collocatordb.o Makefile
$(CC) $(CFLAGS) -L. -L/usr/local/lib $@.c -o$@ collocatordb.o -std=gnu99 -lstdc++ -lm -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CCFLAGS) $(EXEC_LDFLAGS)
diff --git a/ccdbknn.py b/ccdbknn.py
new file mode 100644
index 0000000..dd7e86c
--- /dev/null
+++ b/ccdbknn.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python3
+from __future__ import print_function
+import numpy
+import sys
+import nmslib
+import time
+import math
+import os.path
+from scipy.sparse import csr_matrix
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+def read_data(filename, max_qty = None):
+    row = []
+    col = []
+    data = []
+    read_qty = 0
+    row_max = 0
+    with open(filename,'r') as f:
+        read_num_ft = 0
+        for line in f:
+            x = line.strip().split()
+            if (len(x) == 0): continue
+            if (len(x) % 2 != 0):
+                raise(Exception('Poorly formatted line %d in file %s' % (read_qty + 1, filename)))
+            for i in range(0, len(x), 2):
+                #row.append(int(x[0])-1)
+                row.append(read_qty)
+                feat_id = int(x[i])
+                read_num_ft = max(read_num_ft, feat_id + 1)
+                col.append(feat_id)
+                data.append(float(x[i+1]))
+
+            read_qty = read_qty+1
+            # row_max = max(row_max, int(x[0]))
+            if max_qty is not None and read_qty >= max_qty: break
+            if (read_qty % 1000) == 0:
+                eprint('Read %d rows' % read_qty)
+    eprint('Read %d rows, # of features %d' % (read_qty, read_num_ft))
+    ft_mat = csr_matrix((numpy.array(data), (numpy.array(row), numpy.array(col))),
+                        shape=(read_qty, read_num_ft))
+    return (read_qty, ft_mat)
+
+input_file = sys.argv[1]
+
+# This file will contain nearest neighbors, one per line:
+# node [tab char] neighbor_1 neighbor_2 ...
+
+out_file = input_file + ".bin"  # currently unused; kept for the commented-out saveIndex below
+
+(all_qty, data_matrix) = read_data(input_file)
+# Set index parameters
+# These are the most important ones
+M = 30
+efC = 100
+
+num_threads = os.cpu_count() or 4  # was hard-coded to 70
+index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
+K=100
+# Initialize the library, specify the space, the type of the vector and add data points
+index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR)
+index.addDataPointBatch(data_matrix)
+eprint('Starting index creation.')
+# Create an index
+start = time.time()
+index.createIndex(index_time_params, print_progress=True)
+end = time.time()
+eprint('Index-time parameters', index_time_params)
+eprint('Indexing time = %f' % (end-start))
+
+# Setting query-time parameters
+efS = 100
+query_time_params = {'efSearch': efS}
+eprint('Setting query-time parameters', query_time_params)
+index.setQueryTimeParams(query_time_params)
+# Querying
+query_qty = data_matrix.shape[0]
+start = time.time()
+nbrs = index.knnQueryBatch(data_matrix, k = K, num_threads = num_threads)
+end = time.time()
+eprint('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
+       (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))
+
+for i in range(0, len(nbrs), 1):
+    for j in range(0, len(nbrs[i][0]), 1):
+        print("%d %f " % (nbrs[i][0][j], nbrs[i][1][j]), end='')
+    print()
+#index.saveIndex('sparse_index.bin')
diff --git a/collocatordb.cc b/collocatordb.cc
index ccddecb..faedad0 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -323,6 +323,7 @@
virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
void dump(uint32_t w1, uint32_t w2, int8_t dist);
vector<Collocator> get_collocators(uint32_t w1);
+ void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
vector<Collocator> get_collocators_avg(uint32_t w1);
string collocators2json(vector<Collocator> collocators);
@@ -612,6 +613,40 @@
return collocators;
}
+  // Dump one line of sparse LLR features for word id w1 to stdout, in the
+  // form "<w2> <llr> <w2> <llr> ...", one pair per qualifying collocate.
+  void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
+    std::stringstream stream;
+    uint64_t w2, last_w2 = 0xffffffffffffffff;
+    uint64_t maxv = 0;  // max co-occurrence count over all distances for last_w2
+    bool first = true;
+    // Emit the finished group for last_w2 if it reaches min_cooccur.
+    auto flush = [&]() {
+      if (last_w2 != 0xffffffffffffffff && maxv >= min_cooccur) {
+        double llr = calculateLLR(_vocab[w1].freq, total, maxv, _vocab[last_w2].freq);
+        if (first) first = false; else stream << " ";
+        stream << last_w2 << " " << llr;  // fixed: previously printed w2 (the next group's id)
+      }
+    };
+    for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
+      uint64_t value = it->intValue();
+      w2 = W2(it->intKey());
+      if (last_w2 == 0xffffffffffffffff) last_w2 = w2;
+      if (w2 != last_w2) {
+        flush();
+        last_w2 = w2;
+        maxv = value;
+      } else if (value > maxv) {
+        maxv = value;
+      }
+    }
+    flush();  // fixed: the last group was previously dropped on loop exit
+    if (first)  // nothing qualified: emit a dummy feature so every id yields a row
+      stream << "1 0.0";
+    stream << "\n";
+    std::cout << stream.str();
+  }
+
rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
diff --git a/collocatordb.h b/collocatordb.h
index fe76e6c..49ef66c 100644
--- a/collocatordb.h
+++ b/collocatordb.h
@@ -12,6 +12,21 @@
#ifdef __cplusplus
namespace rocksdb {
+  class Collocator {
+  public:
+    uint64_t w2;          // collocate word id (printed alongside scores in dumpllr.cc)
+    uint64_t raw;         // raw co-occurrence count (dumpllr.cc thresholds on it)
+    double pmi;           // association scores; presumably pmi/npmi = (normalized)
+    double npmi;          // pointwise mutual information, llr = log-likelihood ratio,
+    double llr;           // lfmd/fpmi = md/pmi-family measures -- TODO confirm
+    double lfmd;
+    double fpmi;
+    double left_lfmd;     // NOTE(review): left_*/right_* look like directional
+    double right_lfmd;    // (window-side) variants -- verify against get_collocators
+    double left_npmi;
+    double right_npmi;
+  };
+
class CollocatorIterator : public Iterator {
public:
CollocatorIterator(const Iterator& it);
@@ -27,8 +42,10 @@
extern "C" {
class CollocatorDB {
public:
- CollocatorDB(const char *db_name);
- ~CollocatorDB();
+ std::string getWord(uint32_t w1);
+ std::vector<Collocator> get_collocators(uint32_t w1);
+ void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
+    CollocatorDB(const char *db_name, const bool read_only = false);  // default keeps existing one-arg callers compiling
void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
void dump(const uint32_t w1, const uint32_t w2, const uint8_t dist);
CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
diff --git a/dumpllr.cc b/dumpllr.cc
new file mode 100644
index 0000000..00158a5
--- /dev/null
+++ b/dumpllr.cc
@@ -0,0 +1,44 @@
+#include <typeinfo>
+#include <assert.h>
+#include <memory>
+#include <iostream>
+#include <stdint.h>
+#include "collocatordb.h"
+#include <thread>
+#include <chrono>
+#include <sstream> // for ostringstream
+
+using namespace rocksdb;
+
+
+int main(int argc, char** argv) {
+  const int START = 0;
+  const int STOP = 1500000;  // word ids dumped: [START, STOP)
+  int done = 0;  // progress counter, shared across threads
+  if (argc < 2) {  // fixed: argv[1] was dereferenced without a check
+    std::cerr << "Usage: " << argv[0] << " <collocatordb>\n";
+    return 1;
+  }
+  CollocatorDB cdb = CollocatorDB(argv[1], true);  // read-only open
+  std::cerr << "Database " << argv[1] << " opened\n";
+  // ordered + schedule(static,1) emits rows in id order despite parallel scoring
+  #pragma omp parallel for ordered schedule(static,1)
+  for (uint32_t i = START; i < STOP; i++) {
+    std::vector<rocksdb::Collocator> cs = cdb.get_collocators(i);
+    std::stringstream stream;
+    if (cs.empty()) stream << "0 0.0";  // keep one output line per id regardless
+    for (const rocksdb::Collocator &c : cs) {
+      stream << c.w2 << " " << c.npmi << " ";
+      if (c.raw < 5) break;  // presumably sorted by strength; stop at rare pairs
+    }
+    stream << "\n";
+    #pragma omp ordered
+    std::cout << stream.str();
+    int my_done;  // snapshot of the counter for this iteration
+    #pragma omp atomic capture  // fixed: done++ was an unsynchronized data race
+    my_done = done++;
+    if (my_done % 100 == 0)
+      std::cerr << "\r\033[2K" << "done: " << my_done * 100.0 / (STOP - START) << "%" << std::flush;
+  }
+  std::cout << std::flush;
+}