collocatordb: add support for offline calculation of similar profiles

Usage example:
    ./dumpllr ../Analysemethoden/word2vec/models/dereko-2017-ii > dereko.llr
    python3 ccdbknn.py dereko.llr > dereko.sprofiles
diff --git a/Makefile b/Makefile
index 3464afb..b6d0227 100644
--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,9 @@
testcdb: testcdb.cc collocatordb.h collocatordb.o Makefile
$(CXX) $(CXXFLAGS) -L. -L/usr/local/lib $@.cc -o$@ collocatordb.o -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+dumpllr: dumpllr.cc collocatordb.h collocatordb.o Makefile
+	$(CXX) $(CXXFLAGS) -L. -L/usr/local/lib $@.cc -fopenmp -o$@ collocatordb.o -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
c_testcdb: c_testcdb.c collocatordb.h collocatordb.o Makefile
$(CC) $(CFLAGS) -L. -L/usr/local/lib $@.c -o$@ collocatordb.o -std=gnu99 -lstdc++ -lm -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CCFLAGS) $(EXEC_LDFLAGS)
diff --git a/ccdbknn.py b/ccdbknn.py
new file mode 100644
index 0000000..dd7e86c
--- /dev/null
+++ b/ccdbknn.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python3
+from __future__ import print_function
+import numpy
+import sys
+import nmslib
+import time
+import math
+import os.path
+from scipy.sparse import csr_matrix
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+def read_data(filename, max_qty = None):
+    row = []
+    col = []
+    data = []
+    read_qty = 0
+    row_max = 0
+    with open(filename,'r') as f:
+        read_num_ft = 0
+        for line in f:
+            x = line.strip().split()
+            if (len(x) == 0): continue
+            if (len(x) % 2 != 0):
+                raise(Exception('Poorly formatted line %d in file %s' % (read_qty + 1, filename)))
+            for i in range(0, len(x), 2):
+                #row.append(int(x[0])-1)
+                row.append(read_qty)
+                feat_id = int(x[i])
+                read_num_ft = max(read_num_ft, feat_id + 1)
+                col.append(feat_id)
+                data.append(float(x[i+1]))
+
+            read_qty = read_qty+1
+            # row_max = max(row_max, int(x[0]))
+            if max_qty is not None and read_qty >= max_qty: break
+            if (read_qty % 1000) == 0:
+                eprint('Read %d rows' % read_qty)
+    eprint('Read %d rows, # of features %d' % (read_qty, read_num_ft))
+    ft_mat = csr_matrix((numpy.array(data), (numpy.array(row), numpy.array(col))),
+                        shape=(read_qty, read_num_ft))
+    return (read_qty, ft_mat)
+
+input_file = sys.argv[1]
+
+# This file will contain nearest neighbors, one per line:
+# node [tab char] neighbor_1 neighbor_2 ...
+
+out_file = input_file + ".bin"  # currently unused; kept for the commented-out saveIndex below
+
+(all_qty, data_matrix) = read_data(input_file)
+# Set index parameters
+# These are the most important ones
+M = 30
+efC = 100
+
+num_threads = os.cpu_count() or 4  # was hard-coded to 70
+index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
+K=100
+# Initialize the library, specify the space, the type of the vector and add data points
+index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR)
+index.addDataPointBatch(data_matrix)
+eprint('Starting index creation.')
+# Create an index
+start = time.time()
+index.createIndex(index_time_params, print_progress=True)
+end = time.time()
+eprint('Index-time parameters', index_time_params)
+eprint('Indexing time = %f' % (end-start))
+
+# Setting query-time parameters
+efS = 100
+query_time_params = {'efSearch': efS}
+eprint('Setting query-time parameters', query_time_params)
+index.setQueryTimeParams(query_time_params)
+# Querying
+query_qty = data_matrix.shape[0]
+start = time.time()
+nbrs = index.knnQueryBatch(data_matrix, k = K, num_threads = num_threads)
+end = time.time()
+eprint('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
+       (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))
+
+for i in range(0, len(nbrs), 1):
+    for j in range(0, len(nbrs[i][0]), 1):
+        print("%d %f " % (nbrs[i][0][j], nbrs[i][1][j]), end='')
+    print()
+#index.saveIndex('sparse_index.bin')
diff --git a/collocatordb.cc b/collocatordb.cc
index ccddecb..faedad0 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -323,6 +323,7 @@
virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
void dump(uint32_t w1, uint32_t w2, int8_t dist);
vector<Collocator> get_collocators(uint32_t w1);
+ void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
vector<Collocator> get_collocators_avg(uint32_t w1);
string collocators2json(vector<Collocator> collocators);
@@ -612,6 +613,40 @@
return collocators;
}
+  // Dump one line of sparse LLR features for word id w1 to stdout, in the
+  // form "<w2> <llr> <w2> <llr> ...", one pair per qualifying collocate.
+  void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
+    std::stringstream stream;
+    uint64_t w2, last_w2 = 0xffffffffffffffff;
+    uint64_t maxv = 0;  // max co-occurrence count over all distances for last_w2
+    bool first = true;
+    // Emit the finished group for last_w2 if it reaches min_cooccur.
+    auto flush = [&]() {
+      if (last_w2 != 0xffffffffffffffff && maxv >= min_cooccur) {
+        double llr = calculateLLR(_vocab[w1].freq, total, maxv, _vocab[last_w2].freq);
+        if (first) first = false; else stream << " ";
+        stream << last_w2 << " " << llr;  // fixed: previously printed w2 (the next group's id)
+      }
+    };
+    for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
+      uint64_t value = it->intValue();
+      w2 = W2(it->intKey());
+      if (last_w2 == 0xffffffffffffffff) last_w2 = w2;
+      if (w2 != last_w2) {
+        flush();
+        last_w2 = w2;
+        maxv = value;
+      } else if (value > maxv) {
+        maxv = value;
+      }
+    }
+    flush();  // fixed: the last group was previously dropped on loop exit
+    if (first)  // nothing qualified: emit a dummy feature so every id yields a row
+      stream << "1 0.0";
+    stream << "\n";
+    std::cout << stream.str();
+  }
+
rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
diff --git a/collocatordb.h b/collocatordb.h
index fe76e6c..49ef66c 100644
--- a/collocatordb.h
+++ b/collocatordb.h
@@ -12,6 +12,21 @@
#ifdef __cplusplus
namespace rocksdb {
+  class Collocator {
+  public:
+    uint64_t w2;          // collocate word id (printed alongside scores in dumpllr.cc)
+    uint64_t raw;         // raw co-occurrence count (dumpllr.cc thresholds on it)
+    double pmi;           // association scores; presumably pmi/npmi = (normalized)
+    double npmi;          // pointwise mutual information, llr = log-likelihood ratio,
+    double llr;           // lfmd/fpmi = md/pmi-family measures -- TODO confirm
+    double lfmd;
+    double fpmi;
+    double left_lfmd;     // NOTE(review): left_*/right_* look like directional
+    double right_lfmd;    // (window-side) variants -- verify against get_collocators
+    double left_npmi;
+    double right_npmi;
+  };
+
class CollocatorIterator : public Iterator {
public:
CollocatorIterator(const Iterator& it);
@@ -27,8 +42,10 @@
extern "C" {
class CollocatorDB {
public:
- CollocatorDB(const char *db_name);
- ~CollocatorDB();
+ std::string getWord(uint32_t w1);
+ std::vector<Collocator> get_collocators(uint32_t w1);
+ void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
+    CollocatorDB(const char *db_name, const bool read_only = false);  // default keeps existing one-arg callers compiling
void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
void dump(const uint32_t w1, const uint32_t w2, const uint8_t dist);
CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
diff --git a/dumpllr.cc b/dumpllr.cc
new file mode 100644
index 0000000..00158a5
--- /dev/null
+++ b/dumpllr.cc
@@ -0,0 +1,44 @@
+#include <typeinfo>
+#include <assert.h>
+#include <memory>
+#include <iostream>
+#include <stdint.h>
+#include "collocatordb.h"
+#include <thread>
+#include <chrono>
+#include <sstream> // for ostringstream
+
+using namespace rocksdb;
+
+
+int main(int argc, char** argv) {
+  const int START = 0;
+  const int STOP = 1500000;  // word ids dumped: [START, STOP)
+  int done = 0;  // progress counter, shared across threads
+  if (argc < 2) {  // fixed: argv[1] was dereferenced without a check
+    std::cerr << "Usage: " << argv[0] << " <collocatordb>\n";
+    return 1;
+  }
+  CollocatorDB cdb = CollocatorDB(argv[1], true);  // read-only open
+  std::cerr << "Database " << argv[1] << " opened\n";
+  // ordered + schedule(static,1) emits rows in id order despite parallel scoring
+  #pragma omp parallel for ordered schedule(static,1)
+  for (uint32_t i = START; i < STOP; i++) {
+    std::vector<rocksdb::Collocator> cs = cdb.get_collocators(i);
+    std::stringstream stream;
+    if (cs.empty()) stream << "0 0.0";  // keep one output line per id regardless
+    for (const rocksdb::Collocator &c : cs) {
+      stream << c.w2 << " " << c.npmi << " ";
+      if (c.raw < 5) break;  // presumably sorted by strength; stop at rare pairs
+    }
+    stream << "\n";
+    #pragma omp ordered
+    std::cout << stream.str();
+    int my_done;  // snapshot of the counter for this iteration
+    #pragma omp atomic capture  // fixed: done++ was an unsynchronized data race
+    my_done = done++;
+    if (my_done % 100 == 0)
+      std::cerr << "\r\033[2K" << "done: " << my_done * 100.0 / (STOP - START) << "%" << std::flush;
+  }
+  std::cout << std::flush;
+}