#!/usr/bin/env python #coding:utf-8 import numpy as np import scipy as sp from scipy.sparse import * from numpy.random import * from collections import OrderedDict from datetime import datetime # performance logger import cProfile as profile import pstats from itertools import groupby from operator import itemgetter def gen_mat(M, N, R, format="csc"): """generating lil_matrix with random valuables""" mat = sp.sparse.rand(M, N, R, format=format, dtype=np.dtype('float64')) mat = mat * 250 print("{0}x{1} matrix (sparsity:{2}, # of data:{3}) is generated...".format(M, N, R, mat.nnz)) return mat def join_columns1(spamat, target_col): """join columns using getcol() which requires object copying""" return reduce(lambda x,y: x+y, [spamat[:,index] for index in target_col]) def join_columns2(lilmat, target_col): """join columns using lil_matrix's getrowview() which does not require object copying """ lilt = lilmat.T return reduce(lambda x,y: x+y, (lilt.getrowview(index) for index in target_col)) def join_columns3(coomat, N): """join columns using coo indices, numpy's fancy index and pydict""" dic = {} def dic_update(key, val): if (dic.has_key(key)): dic[key] += val else: dic[key] = val mask = (coomat.col % 26 == 0) row = coomat.row[mask] data = coomat.data[mask] n = len(row) [dic_update(row[index], data[index]) for index in range(n)] return dic def get_elapsed_time(func, listargs): """wrapper for calculating consuming time of function""" s = datetime.now() ret = func(*listargs) e = datetime.now() print("{0}\t{1}".format((e-s), func.__name__)) return ret def bench(M, N, R, L): column_indexes = frozenset([i for i in range(0, N, 26)]) print("# lil_matrix") lilmat = gen_mat(M, N, R, format="lil") # get_elapsed_time(join_columns1, (lilmat, column_indexes)) # very slow! get_elapsed_time(join_columns2, (lilmat, column_indexes)) # bit slow print("# csc_matrix") cscmat = lilmat.tocsc() get_elapsed_time(join_columns1, (cscmat, column_indexes)) # bit fast print("# csr_matrix") csrmat = cscmat.tocsr() get_elapsed_time(join_columns1, (cscmat, column_indexes)) # bit fast print("# coo_matrix") coomat = cscmat.tocoo() get_elapsed_time(join_columns1, (cscmat, column_indexes)) # bit fast get_elapsed_time(join_columns3, (coomat, N)) # very fast def main(): M = 1400000 # n_row N = 50000 # n_col R = 0.0005 # sparsity L = int(N * 0.2) # column_indexes print("-------------------------------------") print("M:{0}, N:{1}, R:{2}, L:{3}".format(M, N, R, L)) print("-------------------------------------") bench(M, N, R, L) if __name__ == "__main__": main()
------------------------------------- M:1400, N:5000, R:0.001, L:1000 ------------------------------------- 1400x5000 matrix (sparsity:0.001, # of data:7000) is generated... # lil_matrix 0:00:01.576205 join_columns1 0:00:00.044499 join_columns2 # csc_matrix 0:00:00.047233 join_columns1 # csr_matrix 0:00:00.047329 join_columns1 # coo_matrix 0:00:00.047277 join_columns1 0:00:00.000282 join_columns3 ------------------------------------- M:140000, N:50000, R:0.001, L:10000 ------------------------------------- 140000x50000 matrix (sparsity:0.001, # of data:7000000) is generated... # lil_matrix 0:00:03.003079 join_columns2 # csc_matrix 0:00:02.640237 join_columns1 # csr_matrix 0:00:02.640100 join_columns1 # coo_matrix 0:00:02.647828 join_columns1 0:00:00.300576 join_columns3