2012/05/31

python tips: effective sparse matrix operation ?

#!/usr/bin/env python
#coding:utf-8
 
import numpy as np
import scipy as sp
from scipy.sparse import *
from numpy.random import *
from collections import OrderedDict
from datetime import datetime
 
# performance logger
import cProfile as profile
import pstats
 
from itertools import groupby
from operator import itemgetter

def gen_mat(M, N, R, format="csc"):
    """generating lil_matrix with random valuables"""
    mat = sp.sparse.rand(M, N, R, format=format, dtype=np.dtype('float64'))
    mat = mat * 250
    print("{0}x{1} matrix (sparsity:{2}, # of data:{3}) is generated...".format(M, N, R, mat.nnz))
    return mat
 
def join_columns1(spamat, target_col):
    """join columns using getcol() which requires object copying"""
    return reduce(lambda x,y: x+y, [spamat[:,index] for index in target_col])

def join_columns2(lilmat, target_col):
    """join columns using lil_matrix's getrowview() which does not require object copying """
    lilt = lilmat.T
    return reduce(lambda x,y: x+y, (lilt.getrowview(index) for index in target_col))

def join_columns3(coomat, N):
    """join columns using coo indices, numpy's fancy index and pydict"""
    dic = {}

    def dic_update(key, val):
        if (dic.has_key(key)):
            dic[key] += val
        else:
            dic[key] = val

    mask = (coomat.col % 26 == 0)
    row = coomat.row[mask]
    data = coomat.data[mask]
    n = len(row)

    [dic_update(row[index], data[index]) for index in range(n)]
    return dic

def get_elapsed_time(func, listargs):
    """wrapper for calculating consuming time of function"""
    s = datetime.now()
    ret = func(*listargs)
    e = datetime.now()
    print("{0}\t{1}".format((e-s), func.__name__))
    return ret
     
def bench(M, N, R, L):
    column_indexes = frozenset([i for i in range(0, N, 26)])

    print("# lil_matrix")
    lilmat = gen_mat(M, N, R, format="lil")
    # get_elapsed_time(join_columns1, (lilmat, column_indexes)) # very slow!
    get_elapsed_time(join_columns2, (lilmat, column_indexes)) # bit slow

    print("# csc_matrix")
    cscmat = lilmat.tocsc()
    get_elapsed_time(join_columns1, (cscmat, column_indexes)) # bit fast

    print("# csr_matrix")
    csrmat = cscmat.tocsr()
    get_elapsed_time(join_columns1, (cscmat, column_indexes)) # bit fast

    print("# coo_matrix")
    coomat = cscmat.tocoo()
    get_elapsed_time(join_columns1, (cscmat, column_indexes)) # bit fast
    get_elapsed_time(join_columns3, (coomat, N))              # very fast

def main():
    M = 1400000      # n_row
    N = 50000        # n_col
    R = 0.0005       # sparsity
    L = int(N * 0.2) # column_indexes
 
    print("-------------------------------------")
    print("M:{0}, N:{1}, R:{2}, L:{3}".format(M, N, R, L))
    print("-------------------------------------")
    bench(M, N, R, L)
     
if __name__ == "__main__":
   main()
-------------------------------------
M:1400, N:5000, R:0.001, L:1000
-------------------------------------
1400x5000 matrix (sparsity:0.001, # of data:7000) is generated...
# lil_matrix
0:00:01.576205 join_columns1
0:00:00.044499 join_columns2
# csc_matrix
0:00:00.047233 join_columns1
# csr_matrix
0:00:00.047329 join_columns1
# coo_matrix
0:00:00.047277 join_columns1
0:00:00.000282 join_columns3

-------------------------------------
M:140000, N:50000, R:0.001, L:10000
-------------------------------------
140000x50000 matrix (sparsity:0.001, # of data:7000000) is generated...
# lil_matrix
0:00:03.003079 join_columns2
# csc_matrix
0:00:02.640237 join_columns1
# csr_matrix
0:00:02.640100 join_columns1
# coo_matrix
0:00:02.647828 join_columns1
0:00:00.300576 join_columns3

No comments:

Post a Comment

100