#!/usr/bin/python3
# Source blob: dd7e86c301f50fe749b6118b4406726645c34e07
# Last change: Marc Kupietz, commit 3400aa5, 2018-06-05 10:28:55 +0200
from __future__ import print_function

import math
import os.path
import sys
import time

import nmslib
import numpy
from scipy.sparse import csr_matrix


def eprint(*args, **kwargs):
    """Print to standard error instead of standard output.

    Takes the same positional and keyword arguments as the built-in
    ``print``, except ``file``: the destination is always ``sys.stderr``,
    so passing ``file=`` raises ``TypeError``.
    """
    print(*args, file=sys.stderr, **kwargs)


def read_data(filename, max_qty=None):
    """Read a sparse matrix from a whitespace-separated text file.

    Every non-empty line becomes one matrix row and must consist of
    alternating ``feature_id value`` tokens. Feature ids are used directly
    as column indices, so the matrix gets ``max(feature_id) + 1`` columns.
    Blank lines are skipped and do not produce a row.

    Args:
        filename: Path of the text file to read.
        max_qty: Maximum number of rows to read, or ``None`` to read the
            whole file. A value ``<= 0`` reads no rows.

    Returns:
        A tuple ``(read_qty, ft_mat)`` where ``read_qty`` is the number of
        rows read and ``ft_mat`` is a ``scipy.sparse.csr_matrix`` of shape
        ``(read_qty, number_of_features)``.

    Raises:
        ValueError: If a line has an odd number of tokens, or a token
            cannot be parsed as ``int`` (feature id) / ``float`` (value).
    """
    row = []
    col = []
    data = []
    read_qty = 0
    read_num_ft = 0
    with open(filename, 'r') as f:
        # line_no is the real position in the file (blank lines included),
        # so error messages point at the offending line.
        for line_no, line in enumerate(f, start=1):
            # Check the limit before consuming a row so that a limit of
            # zero (or less) yields an empty result.
            if max_qty is not None and read_qty >= max_qty:
                break
            x = line.split()
            if not x:
                continue
            if len(x) % 2 != 0:
                raise ValueError('Poorly formated line %d in file %s'
                                 % (line_no, filename))
            for i in range(0, len(x), 2):
                feat_id = int(x[i])
                row.append(read_qty)
                col.append(feat_id)
                data.append(float(x[i + 1]))
                read_num_ft = max(read_num_ft, feat_id + 1)

            read_qty += 1
            if read_qty % 10 == 0:
                eprint('Read %d rows' % read_qty)
    eprint('Read %d rows, # of features %d' % (read_qty, read_num_ft))
    ft_mat = csr_matrix(
        (numpy.array(data), (numpy.array(row), numpy.array(col))),
        shape=(read_qty, read_num_ft))
    return (read_qty, ft_mat)


# Usage: <script> <sparse_input_file>
# Builds an HNSW index over the rows of the input file, then queries the
# index with those same rows and prints the neighbors of each row to stdout.
if len(sys.argv) < 2:
    eprint('Usage: %s <sparse_input_file>' % sys.argv[0])
    sys.exit(1)

input_file = sys.argv[1]

# NOTE(review): out_file is computed but nothing in this script writes to it;
# the neighbors go to stdout and the saveIndex call at the bottom is
# commented out. Kept so the name stays defined.
out_file = input_file + ".bin"

(all_qty, data_matrix) = read_data(input_file)

# Set index parameters
# These are the most important ones
M = 30
efC = 100

num_threads = 70
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
K = 100

# Initialize the library, specify the space, the type of the vector and add data points
index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR)
index.addDataPointBatch(data_matrix)
eprint('Starting index creation.')

# Create an index
start = time.time()
index.createIndex(index_time_params, print_progress=True)
end = time.time()
eprint('Index-time parameters', index_time_params)
eprint('Indexing time = %f' % (end-start))

# Setting query-time parameters
efS = 100
query_time_params = {'efSearch': efS}
eprint('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

# Querying: every indexed row is also used as a query.
query_qty = data_matrix.shape[0]
start = time.time()
nbrs = index.knnQueryBatch(data_matrix, k = K, num_threads = num_threads)
end = time.time()
eprint('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
       (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))

# One output line per query row: "<neighbor_id> <distance> " repeated for
# each neighbor (note the trailing space after every pair).
for nbr_ids, nbr_dists in nbrs:
    print(''.join('%d %f ' % (nbr_id, nbr_dist)
                  for nbr_id, nbr_dist in zip(nbr_ids, nbr_dists)))
#index.saveIndex('sparse_index.bin')