ccdbknn.py - ids-kl/collocatordb - Gitiles

 #!/usr/bin/python3
 from __future__ import print_function
 import numpy
 import sys
 import nmslib
 import time
 import math
 import os.path
 from scipy.sparse import csr_matrix

 def eprint(*args, **kwargs):
     """Print to standard error.

     Accepts the same positional and keyword arguments as the built-in
     ``print``, except ``file``, which is always ``sys.stderr``.
     """
     stream = sys.stderr
     print(*args, file=stream, **kwargs)

 def read_data(filename, max_qty=None):
     """Load a sparse feature matrix from a whitespace-separated text file.

     Every non-blank line describes one matrix row as alternating
     ``feature_id value`` tokens.  Feature ids are used directly as
     zero-based column indices.  Blank lines are skipped and do not
     consume a row index.

     Args:
         filename: Path of the text file to read.
         max_qty: Optional cap on the number of rows to load.  ``None``
             reads the whole file.  The cap is checked after a row has
             been stored, so values below 1 still load one row.

     Returns:
         A tuple ``(read_qty, ft_mat)``: the number of rows read and a
         ``scipy.sparse.csr_matrix`` of shape
         ``(read_qty, largest_feature_id + 1)``.

     Raises:
         ValueError: If a non-blank line has an odd number of tokens, or
             if a token cannot be parsed by ``int()`` / ``float()``.
     """
     row = []
     col = []
     data = []
     read_qty = 0
     read_num_ft = 0
     with open(filename, 'r') as f:
         # line_no is the physical line in the file, so the error message
         # stays accurate even when blank lines were skipped earlier.
         for line_no, line in enumerate(f, 1):
             x = line.split()
             if not x:
                 continue
             if len(x) % 2 != 0:
                 raise ValueError('Poorly formated line %d in file %s'
                                  % (line_no, filename))
             for i in range(0, len(x), 2):
                 feat_id = int(x[i])
                 # The column count is one more than the largest id seen.
                 read_num_ft = max(read_num_ft, feat_id + 1)
                 row.append(read_qty)
                 col.append(feat_id)
                 data.append(float(x[i + 1]))

             read_qty += 1
             if max_qty is not None and read_qty >= max_qty:
                 break
             if read_qty % 10 == 0:
                 eprint('Read %d rows' % read_qty)
     eprint('Read %d rows, # of features %d' % (read_qty, read_num_ft))
     ft_mat = csr_matrix((numpy.array(data), (numpy.array(row), numpy.array(col))),
                         shape=(read_qty, read_num_ft))
     return (read_qty, ft_mat)

 # Command-line usage: <script> <input_file>, where input_file holds one
 # sparse vector per line in the format accepted by read_data().
 if len(sys.argv) < 2:
     eprint('Usage: %s <input_file>' % sys.argv[0])
     sys.exit(1)
 input_file = sys.argv[1]

 # NOTE(review): out_file is computed but never used below; the neighbors
 # are printed to stdout and the saveIndex() call at the end is commented out.
 out_file = input_file + ".bin"

 (all_qty, data_matrix) = read_data(input_file)

 # Set index parameters.
 # These are the most important ones.
 M = 30
 efC = 100

 num_threads = 70
 index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
 K = 100
 # Initialize the library, specify the space, the type of the vector and add data points
 index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR)
 index.addDataPointBatch(data_matrix)
 eprint('Starting index creation.')
 # Create an index
 start = time.time()
 index.createIndex(index_time_params, print_progress=True)
 end = time.time()
 eprint('Index-time parameters', index_time_params)
 eprint('Indexing time = %f' % (end-start))

 # Setting query-time parameters
 efS = 100
 query_time_params = {'efSearch': efS}
 eprint('Setting query-time parameters', query_time_params)
 index.setQueryTimeParams(query_time_params)

 # Querying: every indexed row is also used as a query.
 query_qty = data_matrix.shape[0]
 start = time.time()
 nbrs = index.knnQueryBatch(data_matrix, k = K, num_threads = num_threads)
 end = time.time()
 eprint('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
       (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))

 # Write one stdout line per query row: space-separated "id value " pairs
 # built from the two parallel sequences of each knnQueryBatch result
 # (element [0] is printed as integers, element [1] as floats).
 for result in nbrs:
     nbr_ids, nbr_vals = result[0], result[1]
     print(''.join('%d %f ' % pair for pair in zip(nbr_ids, nbr_vals)))
 # Uncomment to persist the index:
 #index.saveIndex('sparse_index.bin')
	#!/usr/bin/python3
	from __future__ import print_function
	import numpy
	import sys
	import nmslib
	import time
	import math
	import os.path
	from scipy.sparse import csr_matrix

	def eprint(*args, **kwargs):
	    """Print to standard error.

	    Accepts the same positional and keyword arguments as the built-in
	    ``print``, except ``file``, which is always ``sys.stderr``.
	    """
	    print(*args, file=sys.stderr, **kwargs)

	def read_data(filename, max_qty=None):
	    """Load a sparse feature matrix from a whitespace-separated text file.

	    Every non-blank line describes one matrix row as alternating
	    ``feature_id value`` tokens.  Feature ids are used directly as
	    zero-based column indices.  Blank lines are skipped and do not
	    consume a row index.

	    Args:
	        filename: Path of the text file to read.
	        max_qty: Optional cap on the number of rows to load.  ``None``
	            reads the whole file.  The cap is checked after a row has
	            been stored, so values below 1 still load one row.

	    Returns:
	        A tuple ``(read_qty, ft_mat)``: the number of rows read and a
	        ``scipy.sparse.csr_matrix`` of shape
	        ``(read_qty, largest_feature_id + 1)``.

	    Raises:
	        ValueError: If a non-blank line has an odd number of tokens, or
	            if a token cannot be parsed by ``int()`` / ``float()``.
	    """
	    row = []
	    col = []
	    data = []
	    read_qty = 0
	    read_num_ft = 0
	    with open(filename, 'r') as f:
	        # line_no is the physical line in the file, so the error message
	        # stays accurate even when blank lines were skipped earlier.
	        for line_no, line in enumerate(f, 1):
	            x = line.split()
	            if not x:
	                continue
	            if len(x) % 2 != 0:
	                raise ValueError('Poorly formated line %d in file %s'
	                                 % (line_no, filename))
	            for i in range(0, len(x), 2):
	                feat_id = int(x[i])
	                # The column count is one more than the largest id seen.
	                read_num_ft = max(read_num_ft, feat_id + 1)
	                row.append(read_qty)
	                col.append(feat_id)
	                data.append(float(x[i + 1]))

	            read_qty += 1
	            if max_qty is not None and read_qty >= max_qty:
	                break
	            if read_qty % 10 == 0:
	                eprint('Read %d rows' % read_qty)
	    eprint('Read %d rows, # of features %d' % (read_qty, read_num_ft))
	    ft_mat = csr_matrix((numpy.array(data), (numpy.array(row), numpy.array(col))),
	                        shape=(read_qty, read_num_ft))
	    return (read_qty, ft_mat)

	# Command-line usage: <script> <input_file>, where input_file holds one
	# sparse vector per line in the format accepted by read_data().
	if len(sys.argv) < 2:
	    eprint('Usage: %s <input_file>' % sys.argv[0])
	    sys.exit(1)
	input_file = sys.argv[1]

	# NOTE(review): out_file is computed but never used below; the neighbors
	# are printed to stdout and the saveIndex() call at the end is commented out.
	out_file = input_file + ".bin"

	(all_qty, data_matrix) = read_data(input_file)

	# Set index parameters.
	# These are the most important ones.
	M = 30
	efC = 100

	num_threads = 70
	index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
	K = 100
	# Initialize the library, specify the space, the type of the vector and add data points
	index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR)
	index.addDataPointBatch(data_matrix)
	eprint('Starting index creation.')
	# Create an index
	start = time.time()
	index.createIndex(index_time_params, print_progress=True)
	end = time.time()
	eprint('Index-time parameters', index_time_params)
	eprint('Indexing time = %f' % (end-start))

	# Setting query-time parameters
	efS = 100
	query_time_params = {'efSearch': efS}
	eprint('Setting query-time parameters', query_time_params)
	index.setQueryTimeParams(query_time_params)

	# Querying: every indexed row is also used as a query.
	query_qty = data_matrix.shape[0]
	start = time.time()
	nbrs = index.knnQueryBatch(data_matrix, k = K, num_threads = num_threads)
	end = time.time()
	eprint('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
	       (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))

	# Write one stdout line per query row: space-separated "id value " pairs
	# built from the two parallel sequences of each knnQueryBatch result
	# (element [0] is printed as integers, element [1] as floats).
	for result in nbrs:
	    nbr_ids, nbr_vals = result[0], result[1]
	    print(''.join('%d %f ' % pair for pair in zip(nbr_ids, nbr_vals)))
	# Uncomment to persist the index:
	#index.saveIndex('sparse_index.bin')