Marc Kupietz | 3400aa5 | 2018-06-05 10:28:55 +0200 | [diff] [blame] | 1 | #!/usr/bin/python3 |
| 2 | from __future__ import print_function |
| 3 | import numpy |
| 4 | import sys |
| 5 | import nmslib |
| 6 | import time |
| 7 | import math |
| 8 | import os.path |
| 9 | from scipy.sparse import csr_matrix |
| 10 | |
def eprint(*args, **kwargs):
    """Print a message to standard error.

    All positional and keyword arguments are forwarded unchanged to the
    built-in ``print``; only the destination stream is fixed to
    ``sys.stderr``. Because ``file`` is supplied here, passing ``file=``
    as a keyword raises ``TypeError``.
    """
    err_stream = sys.stderr
    print(*args, file=err_stream, **kwargs)
| 13 | |
def read_data(filename, max_qty=None):
    """Read sparse vectors from a text file into a SciPy CSR matrix.

    Every non-blank line describes one matrix row as whitespace-separated
    ``feature_id value`` pairs. Blank lines are skipped and do not produce
    a row. Progress is reported on stderr every 10 rows, followed by a
    final summary line.

    Args:
        filename: Path of the text file to read.
        max_qty: Optional upper bound on the number of rows to read.
            ``None`` (the default) reads the whole file.

    Returns:
        A tuple ``(read_qty, ft_mat)`` where ``read_qty`` is the number of
        rows read and ``ft_mat`` is a ``csr_matrix`` of shape
        ``(read_qty, largest feature id + 1)``.

    Raises:
        ValueError: If a line holds an odd number of tokens (the message
            names the actual line number in the file), or if a token
            cannot be parsed by ``int``/``float``.
    """
    row = []
    col = []
    data = []
    read_qty = 0
    read_num_ft = 0
    with open(filename, 'r') as f:
        # enumerate() tracks the real file line number, so the error below
        # stays accurate even after blank lines have been skipped.
        for line_no, line in enumerate(f, start=1):
            # Checking the limit before parsing makes max_qty=0 read nothing.
            if max_qty is not None and read_qty >= max_qty:
                break
            tokens = line.split()
            if not tokens:
                continue
            if len(tokens) % 2 != 0:
                raise ValueError('Poorly formated line %d in file %s'
                                 % (line_no, filename))
            # Tokens alternate: feature id, value, feature id, value, ...
            for id_token, value_token in zip(tokens[0::2], tokens[1::2]):
                feat_id = int(id_token)
                read_num_ft = max(read_num_ft, feat_id + 1)
                row.append(read_qty)
                col.append(feat_id)
                data.append(float(value_token))

            read_qty += 1
            if read_qty % 10 == 0:
                eprint('Read %d rows' % read_qty)
    eprint('Read %d rows, # of features %d' % (read_qty, read_num_ft))
    ft_mat = csr_matrix((numpy.array(data), (numpy.array(row), numpy.array(col))),
                        shape=(read_qty, read_num_ft))
    return (read_qty, ft_mat)
| 44 | |
# The only command-line argument is the path of the sparse-vector text file
# parsed by read_data(). Exit with a usage message instead of an IndexError
# traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit('Usage: %s INPUT_FILE' % (sys.argv[0] if sys.argv else '<script>'))
input_file = sys.argv[1]

# NOTE: out_file is derived from the input name but nothing below writes to
# it; the nearest neighbors are printed to stdout instead.
out_file = input_file + ".bin"

(all_qty, data_matrix) = read_data(input_file)

# Index-time parameters. M and efConstruction are the most important ones.
M = 30
efC = 100

num_threads = 70
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
# Number of neighbors requested per query.
K = 100

# Initialize the library, specify the space and the vector type, and add
# the data points.
index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR)
index.addDataPointBatch(data_matrix)
eprint('Starting index creation.')

# Create the index and report how long it took.
start = time.time()
index.createIndex(index_time_params, print_progress=True)
end = time.time()
eprint('Index-time parameters', index_time_params)
eprint('Indexing time = %f' % (end-start))

# Set query-time parameters.
efS = 100
query_time_params = {'efSearch': efS}
eprint('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

# Query the index with every row of the indexed matrix itself.
query_qty = data_matrix.shape[0]
start = time.time()
nbrs = index.knnQueryBatch(data_matrix, k = K, num_threads = num_threads)
end = time.time()
elapsed = float(end-start)
# Guard the per-query figures against an empty query set.
per_query = elapsed/query_qty if query_qty else 0.0
eprint('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
       (elapsed, per_query, num_threads*per_query))

# One output line per query: "<neighbor id> <distance> " repeated for each
# neighbor returned (the trailing space is kept).
for result in nbrs:
    neighbor_ids, distances = result[0], result[1]
    # Iterate by index over the id list, as the distances are looked up
    # positionally against it.
    print(''.join('%d %f ' % (neighbor_ids[j], distances[j])
                  for j in range(len(neighbor_ids))))

# Saving the index is currently disabled:
#index.saveIndex('sparse_index.bin')