#!/usr/bin/env python
#coding:utf-8
# NOTE: the wildcard imports stay first so that the explicit imports below
# win any accidental name clash; do not reorder them.
import numpy as np
import scipy as sp
from scipy.sparse import *
from numpy.random import *
from collections import OrderedDict
from datetime import datetime
from functools import reduce
# performance logger
import cProfile as profile
import pstats
from itertools import groupby
from operator import itemgetter
def gen_mat(M, N, R, format="csc"):
    """Build a random sparse matrix, scale it by 250 and report its size.

    Args:
        M: number of rows.
        N: number of columns.
        R: value passed to ``scipy.sparse.rand`` as its third positional
            argument (the density of the generated matrix).
        format: sparse storage format of the result (``"csc"`` by default).

    Returns:
        The generated float64 sparse matrix with every stored value
        multiplied by 250.
    """
    value_type = np.dtype("float64")
    random_mat = sp.sparse.rand(M, N, R, format=format, dtype=value_type)
    scaled = random_mat * 250
    summary = "{0}x{1} matrix (sparsity:{2}, # of data:{3}) is generated...".format(
        M, N, R, scaled.nnz
    )
    print(summary)
    return scaled
def join_columns1(spamat, target_col):
    """Sum the selected columns of a sparse matrix by slicing each one out.

    Every ``spamat[:, index]`` slice creates a new single-column sparse
    matrix, so this approach copies data for each selected column.

    Args:
        spamat: sparse matrix that supports column slicing (``spamat[:, i]``).
        target_col: iterable of column indices to add together.

    Returns:
        A single-column sparse matrix holding the element-wise sum of the
        selected columns.

    Raises:
        TypeError: if ``target_col`` is empty (``reduce`` has no initial value).
    """
    # ``reduce`` is not a builtin on Python 3; functools.reduce works on both
    # Python 2.6+ and Python 3.
    from functools import reduce

    columns = (spamat[:, index] for index in target_col)
    return reduce(lambda acc, col: acc + col, columns)
def join_columns2(lilmat, target_col):
    """Sum the selected columns of a lil_matrix via row views of its transpose.

    The matrix is transposed once so that each requested column becomes a row,
    which is then fetched with ``getrowview()`` instead of slicing.

    Args:
        lilmat: matrix whose transpose provides ``getrowview()`` (a
            ``lil_matrix``).
        target_col: iterable of column indices to add together.

    Returns:
        A single-row sparse matrix holding the element-wise sum of the
        selected columns (i.e. the result is laid out as a row, not a column).

    Raises:
        TypeError: if ``target_col`` is empty (``reduce`` has no initial value).
    """
    # ``reduce`` is not a builtin on Python 3; functools.reduce works on both
    # Python 2.6+ and Python 3.
    from functools import reduce

    transposed = lilmat.T
    row_views = (transposed.getrowview(index) for index in target_col)
    return reduce(lambda acc, row: acc + row, row_views)
def join_columns3(coomat, N, step=26):
    """Sum every ``step``-th column of a COO matrix into a per-row dict.

    Works directly on the COO coordinate arrays: a boolean mask selects the
    stored entries whose column index is a multiple of ``step``, and their
    values are accumulated per row index.

    Args:
        coomat: matrix exposing ``row``, ``col`` and ``data`` arrays
            (a ``coo_matrix``).
        N: unused; kept so existing callers that pass the column count
            keep working.
        step: column stride to select (columns 0, step, 2*step, ...).
            Defaults to 26, the stride that was previously hard-coded.

    Returns:
        dict mapping row index -> sum of the selected values in that row.
        Rows without any selected stored entry are absent from the dict.

    Raises:
        ValueError: if ``step`` is not a positive integer stride.
    """
    if step <= 0:
        raise ValueError("step must be positive, got {0!r}".format(step))

    selected = (coomat.col % step == 0)
    totals = {}
    # ``key in dict`` replaces dict.has_key(), which was removed in Python 3.
    for key, val in zip(coomat.row[selected], coomat.data[selected]):
        if key in totals:
            totals[key] += val
        else:
            totals[key] = val
    return totals
def get_elapsed_time(func, listargs):
    """Call ``func(*listargs)``, print how long it took, and return its result.

    Args:
        func: callable to time; its ``__name__`` is printed next to the
            elapsed time.
        listargs: sequence of positional arguments unpacked into ``func``.

    Returns:
        Whatever ``func`` returned.
    """
    started = datetime.now()
    result = func(*listargs)
    elapsed = datetime.now() - started
    print("{0}\t{1}".format(elapsed, func.__name__))
    return result
def bench(M, N, R, L):
    """Time the column-joining strategies on several sparse formats.

    A random lil_matrix is generated once and converted to CSC, CSR and COO;
    each section prints a header and then the elapsed time of the strategy
    that is run for that format.

    Args:
        M: number of rows of the generated matrix.
        N: number of columns of the generated matrix.
        R: density forwarded to ``gen_mat``.
        L: unused; kept so existing callers keep working.
    """
    # Every 26th column is joined; this must match the stride that
    # join_columns3 filters on (26). An ordered list keeps the summation
    # order deterministic.
    column_indexes = list(range(0, N, 26))

    print("# lil_matrix")
    lilmat = gen_mat(M, N, R, format="lil")
    # join_columns1 on the lil_matrix is left disabled: the original author
    # marked it "very slow!".
    get_elapsed_time(join_columns2, (lilmat, column_indexes))  # bit slow

    print("# csc_matrix")
    cscmat = lilmat.tocsc()
    get_elapsed_time(join_columns1, (cscmat, column_indexes))  # bit fast

    print("# csr_matrix")
    csrmat = cscmat.tocsr()
    # Time the CSR matrix itself (this section previously re-timed cscmat).
    get_elapsed_time(join_columns1, (csrmat, column_indexes))

    print("# coo_matrix")
    coomat = cscmat.tocoo()
    # coo_matrix does not support slicing, so join_columns1 cannot run on it;
    # the CSC timing is repeated here as a baseline for join_columns3.
    get_elapsed_time(join_columns1, (cscmat, column_indexes))
    get_elapsed_time(join_columns3, (coomat, N))  # very fast
def main():
    """Print the benchmark parameters and run the benchmark once."""
    n_rows = 1400000
    n_cols = 50000
    density = 0.0005  # printed as "R"
    n_targets = int(n_cols * 0.2)  # printed as "L"

    rule = "-------------------------------------"
    header = "M:{0}, N:{1}, R:{2}, L:{3}".format(n_rows, n_cols, density, n_targets)
    print(rule)
    print(header)
    print(rule)

    bench(n_rows, n_cols, density, n_targets)


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()
-------------------------------------
M:1400, N:5000, R:0.001, L:1000
-------------------------------------
1400x5000 matrix (sparsity:0.001, # of data:7000) is generated...
# lil_matrix
0:00:01.576205 join_columns1
0:00:00.044499 join_columns2
# csc_matrix
0:00:00.047233 join_columns1
# csr_matrix
0:00:00.047329 join_columns1
# coo_matrix
0:00:00.047277 join_columns1
0:00:00.000282 join_columns3
-------------------------------------
M:140000, N:50000, R:0.001, L:10000
-------------------------------------
140000x50000 matrix (sparsity:0.001, # of data:7000000) is generated...
# lil_matrix
0:00:03.003079 join_columns2
# csc_matrix
0:00:02.640237 join_columns1
# csr_matrix
0:00:02.640100 join_columns1
# coo_matrix
0:00:02.647828 join_columns1
0:00:00.300576 join_columns3