2012/08/23

R tips: operating probability density function in R

There are several probability density functions (PDFs) defined in R. You can find defined PDFs with "library(help='stats')" in R-console. Now, let's plot Gaussian distribution N(0,1) with *norm function.
# Probability density function
> curve(dnorm(x, mean=0, sd=1, log=FALSE), -3, 3, n=1001, col = "blue")           
# Cumulative probability density function
> curve(pnorm(x, mean=0, sd=1, log=FALSE), -3, 3, n=1001, col = "blue")           
# Probabilistic generative model
> rnorm(n=100, mean=0, sd=1) 
  [1]  0.76291389  0.32249468  1.43329768  0.39455640 -0.96082434 -0.03498441
  [7] -0.98776053 -0.69866914  1.31514002 -1.42647303 -0.46658770 -1.32506735
 [13] -1.52074716  0.45169290 -1.11993007 -0.27547321  0.86232800 -2.36687435
 [19] -1.34228574 -0.72800613 -0.41979553 -2.24104445  1.86935264 -0.88038071
 [25] -0.71816560 -0.53788946  0.50974550 -1.23530999  0.77035374  0.06483997
 [31] -0.62251296  1.60636131  0.36695288  0.76654543  0.86672801 -0.75831284
 [37] -1.48413034 -0.37661399  1.34613939 -1.20912506 -0.22231654  2.66835415
 [43]  0.49528498  0.26435606 -0.74862141 -0.41349885 -0.94850251  0.83038086
 [49]  0.29994578 -0.15778865 -1.16049616  2.24109521  0.86357121  1.28825845
 [55]  0.50549591  0.08668917 -0.34392882  2.08620620 -0.02514741 -0.38974372
 [61] -0.07943300  1.01109304  1.26749487  1.65848787  1.40626520  0.13476383
 [67] -0.81541581 -0.23397894 -1.54089193  2.69271063  1.02584790  0.34583390
 [73]  0.35808556 -2.23486925 -0.97331144 -0.75742258 -1.49065873 -0.24702416
 [79]  1.46008300  0.26804260  1.25457656  0.55169794 -0.34509144 -1.24602872
 [85]  0.65070530 -0.69068100  0.98802706 -1.76179226 -0.30577751  0.30197949
 [91] -0.18236187 -0.86313716  1.36986087 -1.23400546 -0.78780711 -2.14942716
 [97]  1.17743473 -0.95787888 -0.27079632  1.79351280
          

2012/08/22

geometry and python tips: automatic generation of Japanese standardized regional meshes

Huge number of geometric object takes time to load (Especially, loading standardized regional meshes are time consuming). Therefore, I implemented standardized regional mesh generator which generates meshes according to input boundary box.
#!/usr/bin/env python
#coding: utf-8

import sys
import datetime

from rtree import index
from rtree.index import Rtree
from shapely.geometry import Polygon

class SrmGenerator():
    def get_mesh1st_to_polygon(self, bbox):
        return dict([(mesh1st, Polygon(self._get_rect(*bbox)))
                     for mesh1st, bbox in self.generate_mesh1st_to_bbox(bbox).items()])

    def generate_mesh1st_to_bbox(self, bbox):
        d_mesh1st_to_bbox = self._get_d_mesh1st_to_bbox()
        rindex = self._create_rtree_index(d_mesh1st_to_bbox)
        return dict([(mesh1st, d_mesh1st_to_bbox[mesh1st])
                     for mesh1st in list(rindex.intersection(bbox))])

    def get_mesh2nd_to_polygon(self, bbox):
        return dict([(mesh2nd, Polygon(self._get_rect(*bbox)))
                     for mesh2nd, bbox in self.generate_mesh2nd_to_bbox(bbox).items()])

    def generate_mesh2nd_to_bbox(self, bbox):
        d_mesh1st_to_bbox = self.generate_mesh1st_to_bbox(bbox)
        d_mesh2nd_to_bbox = self._get_d_mesh2nd_to_bbox(d_mesh1st_to_bbox)
        rindex = self._create_rtree_index(d_mesh2nd_to_bbox)
        return dict([(mesh2nd, d_mesh2nd_to_bbox[mesh2nd])
                     for mesh2nd in list(rindex.intersection(bbox))])

    def get_mesh3rd_to_polygon(self, bbox):
        return dict([(mesh3rd, Polygon(self._get_rect(*bbox)))
                     for mesh3rd, bbox in self.generate_mesh3rd_to_bbox(bbox).items()])

    def generate_mesh3rd_to_bbox(self, bbox):
        d_mesh2nd_to_bbox = self.generate_mesh2nd_to_bbox(bbox)
        d_mesh3rd_to_bbox = self._get_d_mesh3rd_to_bbox(d_mesh2nd_to_bbox)
        rindex = self._create_rtree_index(d_mesh3rd_to_bbox)
        return dict([(mesh3rd, d_mesh3rd_to_bbox[mesh3rd])
                     for mesh3rd in list(rindex.intersection(bbox))])

    def get_mesh4th_to_polygon(self, bbox):
        return dict([(mesh4th, Polygon(self._get_rect(*bbox)))
                     for mesh4th, bbox in self.generate_mesh4th_to_bbox(bbox).items()])

    def generate_mesh4th_to_bbox(self, bbox):
        d_mesh3rd_to_bbox = self.generate_mesh3rd_to_bbox(bbox)
        d_mesh4th_to_bbox = self._get_d_mesh4th_to_bbox(d_mesh3rd_to_bbox)
        rindex = self._create_rtree_index(d_mesh4th_to_bbox)
        return dict([(mesh4th, d_mesh4th_to_bbox[mesh4th])
                     for mesh4th in list(rindex.intersection(bbox))])

    def _create_rtree_index(self, d_meshid_to_bbox):
        p = index.Property()
        rindex = index.Index(properties=p)
        [rindex.add(meshid, bbox) for meshid, bbox in d_meshid_to_bbox.items()]
        return rindex

    def _get_d_mesh1st_to_bbox(self):
        l_x = [str(x).zfill(2) for x in range(20, 50)]
        l_y = [str(y).zfill(2) for y in range(30, 70)]
        return  dict([self._generate_mesh1st_bbox_pair(x, y)
                      for x, y in self._get_combinations(l_x, l_y)])
        
    def _generate_mesh1st_bbox_pair(self, x, y):
        id_mesh1st = int("{0}{1}".format(y, x))
        left = float("1{0}".format(x))
        bottom = float(y) / 1.5
        right = float("1{0}".format(str(int(x) + 1)))
        top = float(int(y) + 1) / 1.5
        return (id_mesh1st, (left, bottom, right, top))

    def _get_d_mesh2nd_to_bbox(self, d_mesh1st_to_bbox, num_split=8):
        fn = self._get_meshid_for_2nd_3rd
        return self._get_d_meshid_to_bbox(d_mesh1st_to_bbox, num_split, fn)

    def _get_d_mesh3rd_to_bbox(self, d_mesh2nd_to_bbox, num_split=10):
        fn = self._get_meshid_for_2nd_3rd
        return self._get_d_meshid_to_bbox(d_mesh2nd_to_bbox, num_split, fn)

    def _get_d_mesh4th_to_bbox(self, d_mesh3rd_to_bbox, num_split=2):
        fn = self._get_meshid_for_4th
        return self._get_d_meshid_to_bbox(d_mesh3rd_to_bbox, num_split, fn)

    def _get_d_meshid_to_bbox(self, d_meshcode_to_bbox, num_split, fn):
        D = {}
        for meshcode, bbox in d_meshcode_to_bbox.items():
            left, bottom, right, top = bbox
            x_delta = (right - left) / num_split
            y_delta = (top - bottom) / num_split
            x_coord = [left] + [left+x_delta*i for i in range(1, num_split)] + [right]
            y_coord = [bottom] + [bottom+y_delta*i for i in range(1, num_split)] + [top]
            for i, j in self._get_combinations(range(len(y_coord)-1), range(len(x_coord)-1)):
                new_meshcode = fn(meshcode, i, j)
                bbox = (x_coord[j], y_coord[i], x_coord[j+1], y_coord[i+1])
                D.update({new_meshcode: bbox})
        return D

    def _get_meshid_for_2nd_3rd(self, meshcode, i, j):
        return int(str(meshcode) + str(i) + str(j))

    def _get_meshid_for_4th(self, meshcode, i, j):
        return int(str(meshcode) + self._get_mesh4th_last_code(i, j))

    def _get_mesh4th_last_code(self, y, x):
        if (y == 0 and x == 0):
            return '1'
        elif (y == 0 and x == 1):
            return '2'
        elif (y == 1 and x == 0):
            return '3'
        elif (y == 1 and x == 1):
            return '4'
        else:
            print "unexpected error in _get_mesh4th_last_code"
            sys.exit(0)

    def _get_combinations(self, L1, L2):
        ret = []
        for e1 in L1 :
            for e2 in L2:
                ret.append([e1, e2])
        return ret

    def _get_rect(self, left, bottom, right, top):
        return ((left, bottom),
                (left, top),
                (right, top),
                (right, bottom),
                (left, bottom))
                
if __name__ == "__main__":
    L = 130.0
    R = 140.0
    B = 34.0
    T = 34.2
    bbox = (L, B, R, T)

    from datetime import datetime
    s = datetime.now()
    print len(SrmGenerator().get_mesh4th_to_polygon(bbox))
    e = datetime.now()
    print(e-s)
$ python generate_area_srg.py
80100
0:00:07.602852

geometry and python tips: How to load shapefile correctly using python?

Shapely provides an easy way to operate geometrical objects. However, shapely does not have good functionality for loading ESRI shapefile directly and formatting topologically complex geometry for making shapely's method arguments is tired work. Below code will solve this issue: combination of shapely.wkb.loads and osgeo library. To write this code, I referred here.
#!/usr/bin/env python
#coding: utf-8

import sys
import cPickle as pickle
import collections

from osgeo import ogr
from shapely.wkb import loads

def main():
    path_to_shp = argvs[1]
    path_to_out = argvs[2]

    d_fid2pol = get_d_fid2pol(path_to_shp)
    dump_object_to_file(d_fid2pol, path_to_out)

def get_d_fid2pol(path_to_shp):
    """
    :param path_to_shp: path to *.shp
    :type path_to_shp: string

    :returns: {fid: MultiPolygon(), ...}
    :rtype: dictionary
    """
    data = ogr.Open(path_to_shp)
    elements = data.GetLayer()
    return dict([get_fid_and_poly(element) for element in elements])

def get_fid_and_poly(element):
    """
    :param element: element of shapefile
    :type element: object

    :returns: {fid: MultiPolygon()}
    :rtype: element of dictionary
    """
    fid = element.GetField(0)
    poly = loads(element.GetGeometryRef().ExportToWkb())
    return (fid, poly)
    
def dump_object_to_file(obj, path_to_out):
    with open(path_to_out, 'wb') as F:
        pickle.dump(obj, F, -1)

if __name__ == "__main__":
    main()

2012/08/21

R tips: first contact with linear pattern recognition in R.

This is just test code for linear pattern recognition. Below code is introduced in http://www.slideshare.net/sleipnir002/05-12739580.
> index <- sample(nrow(data), nrow(data)*0.2) #sample data
> index
 [1]  83  47 146  59 111  94 121  14  15 122  33  22  87  78  66 133  18  79  19
[20]   9  88  32 116  60   7 104  67  20  86   1
> data[index,]
    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species Species2
83           5.8         2.7          3.9         1.2 versicolor   vervir
47           5.1         3.8          1.6         0.2     setosa   setosa
146          6.7         3.0          5.2         2.3  virginica   vervir
59           6.6         2.9          4.6         1.3 versicolor   vervir
111          6.5         3.2          5.1         2.0  virginica   vervir
94           5.0         2.3          3.3         1.0 versicolor   vervir
121          6.9         3.2          5.7         2.3  virginica   vervir
14           4.3         3.0          1.1         0.1     setosa   setosa
15           5.8         4.0          1.2         0.2     setosa   setosa
122          5.6         2.8          4.9         2.0  virginica   vervir
33           5.2         4.1          1.5         0.1     setosa   setosa
22           5.1         3.7          1.5         0.4     setosa   setosa
87           6.7         3.1          4.7         1.5 versicolor   vervir
78           6.7         3.0          5.0         1.7 versicolor   vervir
66           6.7         3.1          4.4         1.4 versicolor   vervir
133          6.4         2.8          5.6         2.2  virginica   vervir
18           5.1         3.5          1.4         0.3     setosa   setosa
79           6.0         2.9          4.5         1.5 versicolor   vervir
19           5.7         3.8          1.7         0.3     setosa   setosa
9            4.4         2.9          1.4         0.2     setosa   setosa
88           6.3         2.3          4.4         1.3 versicolor   vervir
32           5.4         3.4          1.5         0.4     setosa   setosa
116          6.4         3.2          5.3         2.3  virginica   vervir
60           5.2         2.7          3.9         1.4 versicolor   vervir
7            4.6         3.4          1.4         0.3     setosa   setosa
104          6.3         2.9          5.6         1.8  virginica   vervir
67           5.6         3.0          4.5         1.5 versicolor   vervir
20           5.1         3.8          1.5         0.3     setosa   setosa
86           6.0         3.4          4.5         1.6 versicolor   vervir
1            5.1         3.5          1.4         0.2     setosa   setosa
> res <- glm(Species~.,iris[index,], family=binomial) #linear regression
 警告メッセージ: 
 glm.fit: 数値的に 0 か 1 である確率が生じました  
> res$coefficient
 (Intercept) Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    6.587948    -7.617771   -13.925058    36.005598   -31.809827 
> logLik(res) #log-scale likelyhood
'log Lik.' -1.785923e-10 (df=5)
> predict(res,iris,type="response") # predictions from fitting functions
           1            2            3            4            5            6 
2.220446e-16 4.328295e-13 2.220446e-16 3.870520e-11 2.220446e-16 2.220446e-16 
           7            8            9           10           11           12 
2.220446e-16 2.220446e-16 7.856922e-11 9.478634e-11 2.220446e-16 4.737275e-12 
          13           14           15           16           17           18 
2.231699e-11 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
          19           20           21           22           23           24 
2.220446e-16 2.220446e-16 1.795636e-12 2.220446e-16 2.220446e-16 2.220446e-16 
          25           26           27           28           29           30 
2.326153e-07 2.709491e-10 2.220446e-16 2.220446e-16 2.220446e-16 1.643915e-10 
          31           32           33           34           35           36 
3.088882e-10 2.220446e-16 2.220446e-16 2.220446e-16 3.937881e-12 2.220446e-16 
          37           38           39           40           41           42 
2.220446e-16 2.220446e-16 5.330791e-13 2.220446e-16 2.220446e-16 1.769196e-10 
          43           44           45           46           47           48 
2.220446e-16 2.220446e-16 1.556449e-13 2.220446e-16 2.220446e-16 2.626083e-13 
          49           50           51           52           53           54 
2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
          55           56           57           58           59           60 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
          61           62           63           64           65           66 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 9.999926e-01 1.000000e+00 
          67           68           69           70           71           72 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
          73           74           75           76           77           78 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
          79           80           81           82           83           84 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
          85           86           87           88           89           90 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
          91           92           93           94           95           96 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
          97           98           99          100          101          102 
1.000000e+00 1.000000e+00 9.974026e-01 1.000000e+00 1.000000e+00 1.000000e+00 
         103          104          105          106          107          108 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         109          110          111          112          113          114 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         115          116          117          118          119          120 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         121          122          123          124          125          126 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         127          128          129          130          131          132 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         133          134          135          136          137          138 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         139          140          141          142          143          144 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         145          146          147          148          149          150 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
> levels(data$Species2)
[1] "setosa" "vervir"
> yhat <- levels(data$Species2)[(predict(res,iris,type="response")>0.5)+1]
> yhat
  [1] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
  [9] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
 [17] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
 [25] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
 [33] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
 [41] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
 [49] "setosa" "setosa" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
 [57] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
 [65] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
 [73] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
 [81] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
 [89] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
 [97] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
[105] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
[113] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
[121] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
[129] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
[137] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
[145] "vervir" "vervir" "vervir" "vervir" "vervir" "vervir"
> mean(yhat[index] != data$Species2[index]) #training error
[1] 0
> mean(yhat[-index] != data$Species2[-index]) #estimation error
[1] 0
> table(true=data$Species2,prediction=yhat) #show result
        prediction
true     setosa vervir
  setosa     50      0
  vervir      0    100

R tips: inserting factor value into data frame

Here is sample R script for inserting factor value into data frame.
> data <- data.frame(iris)
> cind <- length(data) + 1
> data[data$Species == 'setosa', cind] <- c('setosa')
> data[data$Species != 'setosa', cind] <- c('vervir')
> data[,cind] <- as.factor(data[,cind])
> names(data)
[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     
[6] "V6"          
> names(data)[6]
[1] "V6"
> names(data)[6] <- c('Species2')

2012/08/15

geometry tips: generates identifier of standard grid cell which covers Japan region from latitude and longitude.

In geometric operation with python, sometimes we map point (which is latitude and longitude pair) to corresponding standard grid cell. Here is sample code for mapping point to identifier of the cell.
#!/usr/bin/env python
#coding: utf-8

def get_meshcode_from_latlon(lat, lon):
    lat, lon = lat * 3600, lon * 3600

    l_code = []

    l_code += str(int(lat/2400+100))[1:3]
    l_code += str(int(lon/3600+100))[1:3]
    t_lat, t_lon = int(lat%2400), int(lon%3600)

    l_code += str(t_lat/300)
    l_code += str(t_lon/450)
    t_lat, t_lon = t_lat % 300, t_lon % 450

    l_code += str(t_lat/30)
    l_code += str(t_lon/45)
    t_lat, t_lon = t_lat % 30, t_lon % 45

    if (t_lat < 15):
        if (t_lon < 23):
            l_code += '1'
        else:
            l_code += '2'
    else:
        if (t_lat < 23):
            l_code += '3'
        else:
            l_code += '4'

    return ('').join(l_code)

if __name__ == "__main__":
    lat = 34.12345
    lon = 139.12345
    print get_meshcode_from_latlon(lat, lon)

    lat = 30.12345
    lon = 130.12345
    print get_meshcode_from_latlon(lat, lon)

100