2012/05/31

python tips: effective sparse matrix operation ?

#!/usr/bin/env python
#coding:utf-8
 
import numpy as np
import scipy as sp
from scipy.sparse import *
from numpy.random import *
from collections import OrderedDict
from datetime import datetime
 
# performance logger
import cProfile as profile
import pstats
 
from itertools import groupby
from operator import itemgetter

def gen_mat(M, N, R, format="csc"):
    """generating lil_matrix with random valuables"""
    mat = sp.sparse.rand(M, N, R, format=format, dtype=np.dtype('float64'))
    mat = mat * 250
    print("{0}x{1} matrix (sparsity:{2}, # of data:{3}) is generated...".format(M, N, R, mat.nnz))
    return mat
 
def join_columns1(spamat, target_col):
    """join columns using getcol() which requires object copying"""
    return reduce(lambda x,y: x+y, [spamat[:,index] for index in target_col])

def join_columns2(lilmat, target_col):
    """join columns using lil_matrix's getrowview() which does not require object copying """
    lilt = lilmat.T
    return reduce(lambda x,y: x+y, (lilt.getrowview(index) for index in target_col))

def join_columns3(coomat, N):
    """join columns using coo indices, numpy's fancy index and pydict"""
    dic = {}

    def dic_update(key, val):
        if (dic.has_key(key)):
            dic[key] += val
        else:
            dic[key] = val

    mask = (coomat.col % 26 == 0)
    row = coomat.row[mask]
    data = coomat.data[mask]
    n = len(row)

    [dic_update(row[index], data[index]) for index in range(n)]
    return dic

def get_elapsed_time(func, listargs):
    """wrapper for calculating consuming time of function"""
    s = datetime.now()
    ret = func(*listargs)
    e = datetime.now()
    print("{0}\t{1}".format((e-s), func.__name__))
    return ret
     
def bench(M, N, R, L):
    column_indexes = frozenset([i for i in range(0, N, 26)])

    print("# lil_matrix")
    lilmat = gen_mat(M, N, R, format="lil")
    # get_elapsed_time(join_columns1, (lilmat, column_indexes)) # very slow!
    get_elapsed_time(join_columns2, (lilmat, column_indexes)) # bit slow

    print("# csc_matrix")
    cscmat = lilmat.tocsc()
    get_elapsed_time(join_columns1, (cscmat, column_indexes)) # bit fast

    print("# csr_matrix")
    csrmat = cscmat.tocsr()
    get_elapsed_time(join_columns1, (cscmat, column_indexes)) # bit fast

    print("# coo_matrix")
    coomat = cscmat.tocoo()
    get_elapsed_time(join_columns1, (cscmat, column_indexes)) # bit fast
    get_elapsed_time(join_columns3, (coomat, N))              # very fast

def main():
    M = 1400000      # n_row
    N = 50000        # n_col
    R = 0.0005       # sparsity
    L = int(N * 0.2) # column_indexes
 
    print("-------------------------------------")
    print("M:{0}, N:{1}, R:{2}, L:{3}".format(M, N, R, L))
    print("-------------------------------------")
    bench(M, N, R, L)
     
if __name__ == "__main__":
   main()
-------------------------------------
M:1400, N:5000, R:0.001, L:1000
-------------------------------------
1400x5000 matrix (sparsity:0.001, # of data:7000) is generated...
# lil_matrix
0:00:01.576205 join_columns1
0:00:00.044499 join_columns2
# csc_matrix
0:00:00.047233 join_columns1
# csr_matrix
0:00:00.047329 join_columns1
# coo_matrix
0:00:00.047277 join_columns1
0:00:00.000282 join_columns3

-------------------------------------
M:140000, N:50000, R:0.001, L:10000
-------------------------------------
140000x50000 matrix (sparsity:0.001, # of data:7000000) is generated...
# lil_matrix
0:00:03.003079 join_columns2
# csc_matrix
0:00:02.640237 join_columns1
# csr_matrix
0:00:02.640100 join_columns1
# coo_matrix
0:00:02.647828 join_columns1
0:00:00.300576 join_columns3

statistics tips: my covariance matrix implementation on python

#!/usr/bin/env python                                                                                                                
#coding: utf-8                                                                                                                       
                                                                                                                                     
import numpy as np                                                                                                                   
from numpy.random import *                                                                                                           
                                                                                                                                     
def get_covariance_matrix(A):                                                                                                        
    N, M = A.shape                                                                                                                   
    ret = np.reshape(np.zeros(N*M), (N,M))                                                                                           
                                                                                                                                     
    for n in range(0, N):                                                                                                            
        for m in range(0, M):                                                                                                        
            ret[n][m] = calc_covariance(A, n, m)                                                                                     
                                                                                                                                     
    return ret                                                                                                                       
                                                                                                                                     
def calc_covariance(A, n, m):                                                                                                        
    nominator = np.sum((A[n] - np.average(A[n])) * (A[m] - np.average(A[m])))                                                        
    denominator = A.shape[0] - 1                                                                                                     
    return nominator/denominator                                                                                                     
                                                                                                                                     
def test():                                                                                                                          
    D = 4                                                                                                                            
    A = np.reshape([randint(100) for i in range(D**2)], (D, D))                                                                      
                                                                                                                                     
    print("# numpy.cov() output:")                                                                                                   
    print(np.cov(A))                                                                                                                 
                                                                                                                                     
    print("# my covariance matrix:")                                                                                                 
    print(get_covariance_matrix(A))                                                                                                  
                                                                                                                                     
    print("# my precision matrix:")                                                                                                  
    print(get_covariance_matrix(A) ** -1)  
                                                                                                                                     
if __name__ == "__main__":                                                                                                           
    test()                             
yaboo@overtones:~$ python kyoubunsan.py
# numpy.cov() output:
[[  226.91666667   -26.83333333   -39.33333333   499.75      ]
 [  -26.83333333   328.33333333   629.          -257.83333333]
 [  -39.33333333   629.          1432.66666667  -882.        ]
 [  499.75        -257.83333333  -882.          1976.25      ]]                                                                      
# my covariance matrix:
[[  226.91666667   -26.83333333   -39.33333333   499.75      ]
 [  -26.83333333   328.33333333   629.          -257.83333333]
 [  -39.33333333   629.          1432.66666667  -882.        ]
 [  499.75        -257.83333333  -882.          1976.25      ]]
# my precision matrix:                                                                                                               
[[ 0.0044069  -0.03726708 -0.02542373  0.002001  ]
 [-0.03726708  0.00304569  0.00158983 -0.00387847]
 [-0.02542373  0.00158983  0.000698   -0.00113379]
 [ 0.002001   -0.00387847 -0.00113379  0.00050601]]

2012/05/22

algorithm tips: Sparse Matrix Multiplication Package (SMMP)

UNDERWRITING... Matrix operation is very useful for improving computing speed of our codes. For this operation, python has very popular packages such as numpy and scipy. In this article, I show sample code to calculate dot product using sparse matrix and vector with numpy and scipy.
import scipy
import numpy
from scipy.sparse import *
from numpy.random import *

vec = create_vec(N)
mat = create_mat(N,M)
ans = mat.dot(vec)
In above code, mat.dot(vec) calculates dot product. This method implements sparse matrix multiplication pacakage called SMMP which computational complexity is : O(n_row*K^2 + max(n_row,n_col))  where n_row is # of matrix rows, n_col is # of matrix columns, K is the maximum index for nonzero value in a row of A and column of B.

python tips: unpickling object is very slow because of garbage collection

In python, unpickling gigantic object from file is very much time consuming because of python's garbage collection. Python's garbage collection starts when cPickle loads gigantic object contains a lot of container object and number of container objects in the object exceed threshold of collect() method, python's garbage collection starts.
Now, assume you have 2GB pickle object which contains huge dictionary object in your file system and your python code load it. I compare loading time with/without garbage collection in python 2.7.2 on Ubuntu12.04.
#!/usr/local/bin/python                                                         
#coding: utf-8                                                                  
                                                                                
import cPickle as pickle                                                        
from datetime import datetime                                                   
import gc
                                                                                
F = file('gigantic_data.pickle', 'rb')                                          
s = datetime.now()                                                              
myobj = pickle.load(F)                                                          
e = datetime.now()                                                              
F.close()
                                                                                
print("load_pickle takes is {0}".(e-s))

# disabling cyclic garbage collection
gc.disable()

F = file('gigantic_data.pickle', 'rb')                                          
s = datetime.now()                                                              
myobj = pickle.load(F)                                                          
e = datetime.now()                                                              
F.close()

print("load_pickle takes is {0}".(e-s))
> /usr/local/bin/python above_script.py
load_pickle takes 0:15:16.530199
load_pickle takes 0:00:14.195409

2012/05/18

python tips: multiprocessing and shared memory

With multiprocessing package, it is extremely easy to make programs which share tasks with multiple workers(processes). This post is memo for how to share object which initialized by parent process with child process.

General way for sharing object between parent process and child processes is using Queue or Pipe. However, it's the easiest way for me to use global valuable. Below code is just a example of how to share object between processes using global valuable.


#!/usr/bin/env python
# coding:utf-8

import os
import multiprocessing

x = 10
y = 20
z = []
a = None
b = [1,2,3]
c = None

c = 3

def main():
    a = 1
    print("###############################")
    print("x:: {0}".format(print_value(x)))
    print("y:: {0}".format(print_value(y)))
    print("z:: {0}".format(print_value(z)))
    print("a:: {0}".format(print_value(a)))
    print("b:: {0}".format(print_value(b)))
    print("c:: {0}".format(print_value(c)))
    print("###############################")
    pool = multiprocessing.Pool(3)
    pool.map(func, [1,2,3], chunksize=1)
    

def print_value(val):
    return "{0}, id:{1}".format(val, id(val))
    
def func(val):
    print("pid: {0}, process_number: {1}".format(os.getpid(), val))
    print("x:: {0}".format(print_value(x)))
    print("y:: {0}".format(print_value(y)))
    print("z:: {0}".format(print_value(z)))    
    print("a:: {0}".format(print_value(a)))
    print("b:: {0}".format(print_value(b)))
    print("c:: {0}".format(print_value(c)))    

if __name__ == "__main__":
    main()

Here is output from above code. 'id' means memory-address pointed by valuable.
yaboo@ubuntu:~$ python shared_memory.py 
###############################
x:: 10, id:24606432
y:: 20, id:24606192
z:: [], id:139931380327848
a:: 1, id:24606648
b:: [1, 2, 3], id:139931380327920
c:: 4, id:24606576
###############################
pid: 3747, process_number: 1
x:: 10, id:24606432
pid: 3748, process_number: 2
y:: 20, id:24606192
x:: 10, id:24606432
z:: [], id:139931380327848
pid: 3749, process_number: 3
y:: 20, id:24606192
a:: None, id:9071744
x:: 10, id:24606432
z:: [], id:139931380327848
y:: 20, id:24606192
b:: [1, 2, 3], id:139931380327920
a:: None, id:9071744
c:: 3, id:24606600
z:: [], id:139931380327848
a:: None, id:9071744
b:: [1, 2, 3], id:139931380327920
b:: [1, 2, 3], id:139931380327920
c:: 3, id:24606600
c:: 3, id:24606600
yaboo@ubuntu:~$ 

2012/05/01

tips: port-forwarding with ssh

These are memos for usage of port forwarding with ssh command. First of all, you need to enable ssh port forwarding with below command.
ssh -f -N -L [#port]:[dest_hostname]:22 [user_name_for_proxy_host]@[proxy_hostname]
If you want to transfer some files use below commands.
scp -P [#port] [user_name_for_dest_host]@localhost:[filename] ./[filename]
scp -P [#port]  ./[filename] [user_name_for_dest_host]@localhost:[filename]
If you want to login with ssh, use below commands.
ssh -p [#port] [user_name_for_dest_host]@localhost

100