Blame - extras/ccdbknn.py - ids-kl/collocatordb

blob: dd7e86c301f50fe749b6118b4406726645c34e07 [file] [log] [blame]

Marc Kupietz	3400aa5	2018-06-05 10:28:55 +0200	[diff] [blame]	1	#!/usr/bin/python3
				2	from __future__ import print_function
				3	import numpy
				4	import sys
				5	import nmslib
				6	import time
				7	import math
				8	import os.path
				9	from scipy.sparse import csr_matrix
				10
				11	def eprint(args, *kwargs):
				12	print(args, file=sys.stderr, *kwargs)
				13
				14	def read_data(filename, max_qty = None):
				15	row = []
				16	col = []
				17	data = []
				18	read_qty = 0
				19	row_max = 0
				20	with open(filename,'r') as f:
				21	read_num_ft = 0
				22	for line in f:
				23	x = line.strip().split()
				24	if (len(x) == 0): continue
				25	if (len(x) % 2 != 0):
				26	raise(Exception('Poorly formated line %d in file %s' % (read_qty + 1, filename)))
				27	for i in range(0, len(x), 2):
				28	#row.append(int(x[0])-1)
				29	row.append(read_qty)
				30	feat_id = int(x[i])
				31	read_num_ft = max(read_num_ft, feat_id + 1)
				32	col.append(feat_id)
				33	data.append(float(x[i+1]))
				34
				35	read_qty = read_qty+1
				36	# row_max = max(row_max, int(x[0]))
				37	if max_qty != None and read_qty >= max_qty: break
				38	if (read_qty % 10) == 0:
				39	eprint('Read %d rows' % read_qty)
				40	eprint('Read %d rows, # of features %d' % (read_qty, read_num_ft))
				41	ft_mat = csr_matrix((numpy.array(data), (numpy.array(row), numpy.array(col))),
				42	shape=(read_qty, read_num_ft))
				43	return (read_qty, ft_mat)
				44
				45	input_file = sys.argv[1]
				46
				47	# This file will contain nearest neighbors, one per line:
				48	# node [tab char] neighbor_1 neighbor_2 ...
				49
				50	out_file = input_file + ".bin"
				51
				52	(all_qty, data_matrix) = read_data(input_file)
				53	# Set index parameters
				54	# These are the most important onese
				55	M = 30
				56	efC = 100
				57
				58	num_threads = 70
				59	index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
				60	K=100
				61	# Intitialize the library, specify the space, the type of the vector and add data points
				62	index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR)
				63	index.addDataPointBatch(data_matrix)
				64	eprint('Starting index creation.')
				65	# Create an index
				66	start = time.time()
				67	index.createIndex(index_time_params, print_progress=True)
				68	end = time.time()
				69	eprint('Index-time parameters', index_time_params)
				70	eprint('Indexing time = %f' % (end-start))
				71
				72	# Setting query-time parameters
				73	efS = 100
				74	query_time_params = {'efSearch': efS}
				75	eprint('Setting query-time parameters', query_time_params)
				76	index.setQueryTimeParams(query_time_params)
				77	# Querying
				78	query_qty = data_matrix.shape[0]
				79	start = time.time()
				80	nbrs = index.knnQueryBatch(data_matrix, k = K, num_threads = num_threads)
				81	end = time.time()
				82	eprint('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
				83	(end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))
				84
				85	for i in range(0, len(nbrs), 1):
				86	for j in range(0, len(nbrs[i][0]), 1):
				87	print("%d %f " % (nbrs[i][0][j], nbrs[i][1][j]), end='')
				88	print()
				89	#index.saveIndex('sparse_index.bin')