collocatordb: add support for offline calculation of similar profiles
./dumpllr ../Analysemethoden/word2vec/models/dereko-2017-ii > dereko.llr
python3 ccdbknn.py dereko.llr > dereko.sprofiles
diff --git a/ccdbknn.py b/ccdbknn.py
new file mode 100644
index 0000000..dd7e86c
--- /dev/null
+++ b/ccdbknn.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python3
+from __future__ import print_function
+import numpy
+import sys
+import nmslib
+import time
+import math
+import os.path
+from scipy.sparse import csr_matrix
+
+def eprint(*args, **kwargs):
+  """Print to stderr, forwarding all print() arguments (keeps stdout free for results)."""
+  print(*args, file=sys.stderr, **kwargs)
+
+def read_data(filename, max_qty = None):
+  """Read a sparse feature file into a SciPy CSR matrix.
+
+  Each non-empty input line describes one row (data point) as an even-length
+  sequence of whitespace-separated "feat_id value" pairs.
+
+  filename -- path of the text file to read
+  max_qty  -- stop after this many rows (None = read everything)
+  Returns (number_of_rows_read, csr_matrix of shape rows x features).
+  Raises Exception on a line with an odd number of tokens.
+  """
+  row = []
+  col = []
+  data = []
+  read_qty = 0
+  row_max = 0  # NOTE(review): never updated or used (see commented-out line below)
+  with open(filename,'r') as f:
+    read_num_ft = 0  # highest feature id seen + 1 == matrix width
+    for line in f:
+      x = line.strip().split()
+      if (len(x) == 0): continue
+      if (len(x) % 2 != 0):
+        raise(Exception('Poorly formated line %d in file %s' % (read_qty + 1, filename)))
+      # one (row, col, value) triple per "feat_id value" pair on this line
+      for i in range(0, len(x), 2):
+        #row.append(int(x[0])-1)
+        row.append(read_qty)
+        feat_id = int(x[i])
+        read_num_ft = max(read_num_ft, feat_id + 1)
+        col.append(feat_id)
+        data.append(float(x[i+1]))
+
+      read_qty = read_qty+1
+      # row_max = max(row_max, int(x[0]))
+      # NOTE(review): style nit — prefer "max_qty is not None"
+      if max_qty != None and read_qty >= max_qty: break
+      # NOTE(review): reports progress every 10 rows, which is very chatty
+      # for large inputs — possibly % 10000 was intended; confirm.
+      if (read_qty % 10) == 0:
+        eprint('Read %d rows' % read_qty)
+  eprint('Read %d rows, # of features %d' % (read_qty, read_num_ft))
+  ft_mat = csr_matrix((numpy.array(data), (numpy.array(row), numpy.array(col))),
+                      shape=(read_qty, read_num_ft))
+  return (read_qty, ft_mat)
+
+input_file = sys.argv[1]
+
+# Output format: one line per input row, written to STDOUT, holding the K
+# approximate nearest neighbours as alternating "neighbour_id distance" pairs
+# (see the print loop at the bottom).
+# NOTE(review): out_file below is computed but never written to — confirm
+# whether dumping the index/results to a .bin file was intended.
+
+out_file = input_file + ".bin"
+
+(all_qty, data_matrix) = read_data(input_file)
+# Set index parameters
+# These are the most important ones
+M = 30     # HNSW: max number of graph neighbours per node
+efC = 100  # HNSW: candidate-list size during index construction
+
+num_threads = 70
+index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
+K=100  # neighbours requested per query row
+# Initialize the library, specify the space, the type of the vector and add data points
+index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR)
+index.addDataPointBatch(data_matrix)
+eprint('Starting index creation.')
+# Create an index
+start = time.time()
+index.createIndex(index_time_params, print_progress=True)
+end = time.time()
+eprint('Index-time parameters', index_time_params)
+eprint('Indexing time = %f' % (end-start))
+
+# Setting query-time parameters
+efS = 100  # HNSW: candidate-list size at query time
+query_time_params = {'efSearch': efS}
+eprint('Setting query-time parameters', query_time_params)
+index.setQueryTimeParams(query_time_params)
+# Querying: every indexed data point is reused as a query, so this is an
+# all-vs-all approximate kNN over the input rows.
+query_qty = data_matrix.shape[0]
+start = time.time()
+nbrs = index.knnQueryBatch(data_matrix, k = K, num_threads = num_threads)
+end = time.time()
+eprint('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
+  (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))
+
+# nbrs[i] is a pair (ids, distances) for row i; print them interleaved.
+for i in range(0, len(nbrs), 1):
+  for j in range(0, len(nbrs[i][0]), 1):
+    print("%d %f " % (nbrs[i][0][j], nbrs[i][1][j]), end='')
+  print()
+#index.saveIndex('sparse_index.bin')