Abracadabra

Python data analysis learning notes, Ch. 12

Numpy高级应用

1
2
3
4
5
6
from __future__ import division
from numpy.random import randn
from pandas import Series
import numpy as np
np.set_printoptions(precision=4)
import sys
1
2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

ndarray对象的内部机制

NumPy 数据类型体系

检测类型是否是某种类型的子类

1
2
3
4
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)
True






True

输出某种类型的所有父类

1
np.float64.mro()
[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

高级数组操作

数组重塑

1
2
3
arr = np.arange(8)
arr
arr.reshape((4, 2))
array([0, 1, 2, 3, 4, 5, 6, 7])






array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])
1
arr.reshape((4, 2)).reshape((2, 4))
array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

-1表示该维度的大小由数组总长度和其余维度自动推断

1
2
arr = np.arange(15)
arr.reshape((5, -1))
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

用其他数组的shape进行重塑

1
2
3
other_arr = np.ones((3, 5))
other_arr.shape
arr.reshape(other_arr.shape)
(3, 5)






array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

拉直

1
2
3
arr = np.arange(15).reshape((5, 3))
arr
arr.ravel()
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])






array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

flatten 与 ravel 类似,但 flatten 总是返回数据的副本(ravel 只在必要时才复制)

1
arr.flatten()
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

C vs. Fortran 顺序

1
2
3
4
arr = np.arange(12).reshape((3, 4))
arr
arr.ravel()
arr.ravel('F')
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])






array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])






array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

数组的合并以及拆分

1
2
3
4
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)
array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])






array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

更方便的方法

1
2
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))
array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])






array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])
1
2
3
4
5
6
7
from numpy.random import randn
arr = randn(5, 2)
arr
first, second, third = np.split(arr, [1, 3])
first
second
third
array([[ 0.9659,  1.3079],
       [-1.7632,  0.0904],
       [-0.6033,  0.2266],
       [-0.4417, -1.8609],
       [-1.2463, -0.6249]])






array([[ 0.9659,  1.3079]])






array([[-1.7632,  0.0904],
       [-0.6033,  0.2266]])






array([[-0.4417, -1.8609],
       [-1.2463, -0.6249]])

堆叠辅助类

r_ 和 c_ 可以让堆叠操作写得更简洁

1
2
3
4
5
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = randn(3, 2)
np.r_[arr1, arr2]
np.c_[np.r_[arr1, arr2], arr]
array([[ 0.    ,  1.    ],
       [ 2.    ,  3.    ],
       [ 4.    ,  5.    ],
       [ 0.0376,  1.8236],
       [ 0.9025, -0.053 ],
       [-0.6849,  1.6728]])






array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [ 0.0376,  1.8236,  3.    ],
       [ 0.9025, -0.053 ,  4.    ],
       [-0.6849,  1.6728,  5.    ]])
1
np.c_[1:6, -10:-5]
array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

元素的重复操作: tile and repeat

元素级重复

1
2
arr = np.arange(3)
arr.repeat(3)
array([0, 0, 0, 1, 1, 1, 2, 2, 2])

指定重复次数

1
arr.repeat([2, 3, 4])
array([0, 0, 1, 1, 1, 2, 2, 2, 2])

多维数组需要指定axis(不指定时数组会先被扁平化再重复)

1
2
3
arr = randn(2, 2)
arr
arr.repeat(2, axis=0)
array([[-0.4628,  1.1142],
       [ 0.3637,  0.4341]])






array([[-0.4628,  1.1142],
       [-0.4628,  1.1142],
       [ 0.3637,  0.4341],
       [ 0.3637,  0.4341]])
1
2
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)
array([[-0.4628,  1.1142],
       [-0.4628,  1.1142],
       [ 0.3637,  0.4341],
       [ 0.3637,  0.4341],
       [ 0.3637,  0.4341]])






array([[-0.4628, -0.4628,  1.1142,  1.1142,  1.1142],
       [ 0.3637,  0.3637,  0.4341,  0.4341,  0.4341]])

块级重复

1
2
arr
np.tile(arr, 2)
array([[-0.4628,  1.1142],
       [ 0.3637,  0.4341]])






array([[-0.4628,  1.1142, -0.4628,  1.1142],
       [ 0.3637,  0.4341,  0.3637,  0.4341]])
1
2
3
arr
np.tile(arr, (2, 1))
np.tile(arr, (3, 2))
array([[-0.4628,  1.1142],
       [ 0.3637,  0.4341]])






array([[-0.4628,  1.1142],
       [ 0.3637,  0.4341],
       [-0.4628,  1.1142],
       [ 0.3637,  0.4341]])






array([[-0.4628,  1.1142, -0.4628,  1.1142],
       [ 0.3637,  0.4341,  0.3637,  0.4341],
       [-0.4628,  1.1142, -0.4628,  1.1142],
       [ 0.3637,  0.4341,  0.3637,  0.4341],
       [-0.4628,  1.1142, -0.4628,  1.1142],
       [ 0.3637,  0.4341,  0.3637,  0.4341]])

花式索引的等价函数: take and put

1
2
3
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]
array([700, 100, 200, 600])
1
2
3
4
5
arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arr
array([700, 100, 200, 600])






array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])






array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])
1
2
3
4
inds = [2, 0, 2, 1]
arr = randn(2, 4)
arr
arr.take(inds, axis=1)
array([[ 0.2772, -1.3059, -1.4607, -0.4856],
       [ 1.5585, -0.4521, -1.6259, -1.6644]])






array([[-1.4607,  0.2772, -1.4607, -1.3059],
       [-1.6259,  1.5585, -1.6259, -0.4521]])

广播

每一个元素都乘以4

1
2
3
arr = np.arange(5)
arr
arr * 4
array([0, 1, 2, 3, 4])






array([ 0,  4,  8, 12, 16])

每一列减去该列的均值(arr.mean(0) 的结果沿第 0 轴广播)

1
2
3
4
5
arr = randn(4, 3)
arr.mean(0)
demeaned = arr - arr.mean(0)
demeaned
demeaned.mean(0)
array([-0.1556,  0.3494, -0.2545])






array([[-0.3753,  0.5353,  1.3534],
       [-0.4282,  0.5606,  0.8935],
       [-0.0956, -0.9767, -1.2444],
       [ 0.899 , -0.1192, -1.0024]])






array([ -5.5511e-17,  -1.3878e-17,   0.0000e+00])
1
2
3
4
5
arr
row_means = arr.mean(1)
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)
array([[-0.5308,  0.8848,  1.0989],
       [-0.5837,  0.91  ,  0.639 ],
       [-0.2511, -0.6273, -1.4989],
       [ 0.7434,  0.2302, -1.2569]])






array([[ 0.4843],
       [ 0.3218],
       [-0.7924],
       [-0.0944]])






array([  7.4015e-17,   0.0000e+00,   0.0000e+00,   0.0000e+00])

沿其他轴向广播

形状不兼容:(4, 3) 与 (4,) 无法直接广播,需要先把均值重塑为 (4, 1)

1
arr - arr.mean(1)
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-31-7b87b85a20b2> in <module>()
----> 1 arr - arr.mean(1)


ValueError: operands could not be broadcast together with shapes (4,3) (4,) 
1
arr - arr.mean(1).reshape((4, 1))
array([[-1.0151,  0.4005,  0.6146],
       [-0.9055,  0.5882,  0.3173],
       [ 0.5413,  0.1652, -0.7065],
       [ 0.8378,  0.3246, -1.1625]])
1
2
3
4
arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis]
arr_3d
arr_3d.shape
array([[[ 0.,  0.,  0.,  0.]],

       [[ 0.,  0.,  0.,  0.]],

       [[ 0.,  0.,  0.,  0.]],

       [[ 0.,  0.,  0.,  0.]]])






(4, 1, 4)
1
2
3
4
arr_1d = np.random.normal(size=3)
arr_1d
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]
array([-1.1083,  0.5576,  1.2277])






array([[-1.1083],
       [ 0.5576],
       [ 1.2277]])






array([[-1.1083,  0.5576,  1.2277]])
1
2
3
4
5
6
arr = randn(3, 4, 5)
arr
depth_means = arr.mean(2)
depth_means
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)
array([[[-1.9966, -0.2431, -0.992 ,  0.8283, -0.5073],
        [-0.3938, -0.1332, -0.7427,  0.3094, -0.9241],
        [ 1.1069, -0.5383, -0.9288,  0.0233, -0.4678],
        [-1.2015,  0.6905,  1.6706, -0.1703, -1.3975]],

       [[-0.3048, -1.7181, -0.189 ,  0.6263,  1.1194],
        [ 0.0823, -0.7132, -0.5162,  1.5305, -1.199 ],
        [ 0.5777,  1.2935,  0.1547, -1.3637,  0.4251],
        [ 0.4923,  1.4004,  0.3646,  0.1594, -0.7334]],

       [[ 1.3836, -0.5313,  0.2826,  0.4739, -1.3435],
        [-1.141 , -0.3084,  1.1364,  1.1326,  0.3064],
        [-0.9692,  1.0229, -0.0246,  1.4484, -1.137 ],
        [ 1.7033, -1.8358,  1.2087, -0.5463,  0.5904]]])






array([[-0.5822, -0.3769, -0.1609, -0.0816],
       [-0.0932, -0.1631,  0.2174,  0.3367],
       [ 0.0531,  0.2252,  0.0681,  0.2241]])






array([[  8.8818e-17,   0.0000e+00,  -4.4409e-17,  -8.8818e-17],
       [  0.0000e+00,   0.0000e+00,   2.7756e-17,   8.8818e-17],
       [  4.4409e-17,   5.5511e-17,   4.4409e-17,   0.0000e+00]])
1
2
3
4
5
6
7
def demean_axis(arr, axis=0):
    """Return ``arr`` with its mean along ``axis`` subtracted.

    Parameters
    ----------
    arr : ndarray
        Input array of any dimensionality.
    axis : int, default 0
        Axis along which the mean is computed and removed.

    Returns
    -------
    ndarray
        Array with the same shape as ``arr`` whose mean along ``axis`` is
        zero (up to floating-point error).
    """
    means = arr.mean(axis)
    # This generalizes things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim  # like ":" on every axis
    indexer[axis] = np.newaxis
    # Index with a tuple: a list mixing slices and None is no longer
    # interpreted as a multi-axis index by recent NumPy versions.
    return arr - means[tuple(indexer)]

通过广播设置数组的值

1
2
3
arr = np.zeros((4, 3))
arr[:] = 5
arr
array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])
1
2
3
4
5
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr
arr[:2] = [[-1.37], [0.509]]
arr
array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])






array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

ufunc高级应用

ufunc实例方法

reduce通过一系列的二元运算对其值进行聚合(可指明轴向)

1
2
3
arr = np.arange(10)
np.add.reduce(arr)
arr.sum()
45






45
1
np.random.seed(12346)

这里用 logical_and.reduce 沿 axis=1 聚合逻辑与,检查每一行是否严格升序

1
2
3
4
5
6
arr = randn(5, 5)
arr
arr[::2].sort(1) # sort a few rows
arr
arr[:, :-1] < arr[:, 1:]
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)
array([[-0.7066,  0.4268, -0.2776, -0.8283, -2.7628],
       [ 0.9835,  0.4378, -0.8496,  0.7188,  0.7329],
       [ 0.5047, -0.7893,  0.5392,  1.2907,  0.8676],
       [ 0.4113,  0.4459, -0.3172, -1.0493,  1.3459],
       [ 0.356 , -0.0915, -0.535 , -0.036 , -0.2591]])






array([[-2.7628, -0.8283, -0.7066, -0.2776,  0.4268],
       [ 0.9835,  0.4378, -0.8496,  0.7188,  0.7329],
       [-0.7893,  0.5047,  0.5392,  0.8676,  1.2907],
       [ 0.4113,  0.4459, -0.3172, -1.0493,  1.3459],
       [-0.535 , -0.2591, -0.0915, -0.036 ,  0.356 ]])






array([[ True,  True,  True,  True],
       [False, False,  True,  True],
       [ True,  True,  True,  True],
       [ True, False, False,  True],
       [ True,  True,  True,  True]], dtype=bool)






array([ True, False,  True, False,  True], dtype=bool)

相对于reduce只输出最后结果,accumulate保留中间结果

1
2
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)
array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]], dtype=int32)

outer对两个数组中所有元素对逐一做运算(类似外积,而不是向量叉积)

1
2
3
arr = np.arange(3).repeat([1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))
array([0, 1, 1, 2, 2])






array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

outer输出结果的维数(ndim)等于两个输入数组维数之和,形状是两者形状的拼接

1
2
result = np.subtract.outer(randn(3, 4), randn(5))
result.shape
(3, 4, 5)
1
2
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])
array([10, 18, 17], dtype=int32)
1
2
3
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)
array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])






array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]], dtype=int32)

自定义 ufuncs

两种不同的调用方式

1
2
3
4
def add_elements(x, y):
    """Return ``x + y`` for a single pair of values.

    This is the scalar kernel that the cells below wrap with
    ``np.frompyfunc`` and ``np.vectorize`` to build element-wise functions.
    """
    return x + y
add_them = np.frompyfunc(add_elements, 2, 1) # 2 input and 1 output
add_them(np.arange(8), np.arange(8))
array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)
1
2
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))
array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.])

自己实现的还是比不上内置优化过的函数

1
2
3
arr = randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)
100 loops, best of 3: 1.81 ms per loop
The slowest run took 16.51 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 3.65 µs per loop

结构化和记录式数组

1
2
3
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr
array([(1.5, 6), (3.141592653589793, -2)], 
      dtype=[('x', '<f8'), ('y', '<i4')])
1
2
sarr[0]
sarr[0]['y']
(1.5, 6)






6
1
sarr['x']
array([ 1.5   ,  3.1416])

嵌套dtype和多维字段

1
2
3
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr
array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)], 
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])
1
arr[0]['x']
array([0, 0, 0], dtype=int64)
1
arr['x']
array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]], dtype=int64)
1
2
3
4
5
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
data['x']['a']
array([(1.0, 2.0), (3.0, 4.0)], 
      dtype=[('a', '<f8'), ('b', '<f4')])






array([5, 6])






array([ 1.,  3.])

更多有关排序的话题

1
2
3
arr = randn(6)
arr.sort()
arr
array([-1.3918, -0.2089,  0.2316,  0.728 ,  0.8356,  1.9956])
1
2
3
4
arr = randn(3, 5)
arr
arr[:, 0].sort() # Sort first column values in-place
arr
array([[ -2.9812e-01,   1.2037e+00,  -1.5768e-02,   7.4395e-01,
          8.6880e-01],
       [ -4.2865e-01,   7.1886e-01,  -1.4510e+00,   1.0510e-01,
         -1.7942e+00],
       [ -2.8792e-04,   6.1168e-01,  -9.1210e-02,  -1.2799e+00,
         -4.0230e-02]])






array([[ -4.2865e-01,   1.2037e+00,  -1.5768e-02,   7.4395e-01,
          8.6880e-01],
       [ -2.9812e-01,   7.1886e-01,  -1.4510e+00,   1.0510e-01,
         -1.7942e+00],
       [ -2.8792e-04,   6.1168e-01,  -9.1210e-02,  -1.2799e+00,
         -4.0230e-02]])
1
2
3
4
arr = randn(5)
arr
np.sort(arr)
arr
array([-0.9699, -0.5626,  1.1172,  0.2791, -1.1148])






array([-1.1148, -0.9699, -0.5626,  0.2791,  1.1172])






array([-0.9699, -0.5626,  1.1172,  0.2791, -1.1148])
1
2
3
4
arr = randn(3, 5)
arr
arr.sort(axis=1)
arr
array([[ 0.2266,  0.3405,  2.6439, -1.6262, -0.3976],
       [-1.4821,  1.068 , -0.252 , -0.9331,  2.2639],
       [-0.2311,  1.1472,  0.9287, -0.9023,  1.1761]])






array([[-1.6262, -0.3976,  0.2266,  0.3405,  2.6439],
       [-1.4821, -0.9331, -0.252 ,  1.068 ,  2.2639],
       [-0.9023, -0.2311,  0.9287,  1.1472,  1.1761]])
1
arr[:, ::-1]
array([[ 2.6439,  0.3405,  0.2266, -0.3976, -1.6262],
       [ 2.2639,  1.068 , -0.252 , -0.9331, -1.4821],
       [ 1.1761,  1.1472,  0.9287, -0.2311, -0.9023]])

间接排序: argsort and lexsort

1
2
3
4
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
values[indexer]
array([1, 2, 4, 3, 0], dtype=int64)






array([0, 1, 2, 3, 5])
1
2
3
4
arr = randn(3, 5)
arr[0] = values
arr
arr[:, arr[0].argsort()]
array([[ 5.    ,  0.    ,  1.    ,  3.    ,  2.    ],
       [ 0.422 ,  0.1187,  1.1352,  1.4363, -1.2487],
       [ 0.1909, -1.0984,  0.7886, -0.5827,  1.1592]])






array([[ 0.    ,  1.    ,  2.    ,  3.    ,  5.    ],
       [ 0.1187,  1.1352, -1.2487,  1.4363,  0.422 ],
       [-1.0984,  0.7886,  1.1592, -0.5827,  0.1909]])
1
2
3
4
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
# lexsort treats the LAST key as the primary one: sort by last name, then first name.
sorter = np.lexsort((first_name, last_name))
# zip() is lazy on Python 3; materialize it so the sorted pairs are actually shown.
list(zip(last_name[sorter], first_name[sorter]))
[('Arnold', 'Jane'), ('Arnold', 'Steve'), ('Jones', 'Bill'), ('Jones', 'Bob'), ('Walters', 'Barbara')]

其他排序算法

1
2
3
4
5
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)
array([2, 3, 4, 0, 1], dtype=int64)






array(['1:first', '1:second', '1:third', '2:first', '2:second'], 
      dtype='<U8')

numpy.searchsorted: 在有序数组中查找元素

1
2
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)
3
1
arr.searchsorted([0, 8, 11, 16])
array([0, 3, 3, 5], dtype=int64)
1
2
3
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
arr.searchsorted([0, 1], side='right')
array([0, 3], dtype=int64)






array([3, 7], dtype=int64)
1
2
3
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data
array([  143.,  8957.,   309.,  2349.,  5503.,  2754.,  4408.,  4259.,
        3313.,  3364.,  2492.,  9977.,  4704.,  5538.,  6089.,  5864.,
        6926.,  3677.,  8698.,  1832.,  8931.,  6631.,  5322.,  3712.,
        9350.,  3945.,  9514.,  3683.,  8568.,  8247.,  7087.,  7630.,
        3392.,  8320.,  1973.,   982.,  1672.,  7052.,  6230.,  3894.,
        1832.,  9488.,   755.,  8522.,  1858.,  5417.,  6162.,  7517.,
        9827.,  4458.])
1
2
labels = bins.searchsorted(data)
labels
array([2, 4, 2, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 4, 4, 4, 4, 3, 4, 3, 4, 4, 4,
       3, 4, 3, 4, 3, 4, 4, 4, 4, 3, 4, 3, 2, 3, 4, 4, 3, 3, 4, 2, 4, 3, 4,
       4, 4, 4, 3], dtype=int64)
1
Series(data).groupby(labels).mean()
2     547.250000
3    3178.550000
4    7591.038462
dtype: float64
1
np.digitize(data, bins)
array([2, 4, 2, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 4, 4, 4, 4, 3, 4, 3, 4, 4, 4,
       3, 4, 3, 4, 3, 4, 4, 4, 4, 3, 4, 3, 2, 3, 4, 4, 3, 3, 4, 2, 4, 3, 4,
       4, 4, 4, 3], dtype=int64)

NumPy matrix class(注意:NumPy 官方已不再推荐使用 np.matrix,建议改用普通 ndarray 配合 @ 运算符)

1
2
3
4
5
6
7
8
X = np.array([[ 8.82768214, 3.82222409, -1.14276475, 2.04411587],
[ 3.82222409, 6.75272284, 0.83909108, 2.08293758],
[-1.14276475, 0.83909108, 5.01690521, 0.79573241],
[ 2.04411587, 2.08293758, 0.79573241, 6.24095859]])
X[:, 0] # one-dimensional
y = X[:, :1] # two-dimensional by slicing
X
y
array([ 8.8277,  3.8222, -1.1428,  2.0441])






array([[ 8.8277,  3.8222, -1.1428,  2.0441],
       [ 3.8222,  6.7527,  0.8391,  2.0829],
       [-1.1428,  0.8391,  5.0169,  0.7957],
       [ 2.0441,  2.0829,  0.7957,  6.241 ]])






array([[ 8.8277],
       [ 3.8222],
       [-1.1428],
       [ 2.0441]])
1
np.dot(y.T, np.dot(X, y))
array([[ 1195.468]])
1
2
3
4
5
Xm = np.matrix(X)
ym = Xm[:, 0]
Xm
ym
ym.T * Xm * ym
matrix([[ 8.8277,  3.8222, -1.1428,  2.0441],
        [ 3.8222,  6.7527,  0.8391,  2.0829],
        [-1.1428,  0.8391,  5.0169,  0.7957],
        [ 2.0441,  2.0829,  0.7957,  6.241 ]])






matrix([[ 8.8277],
        [ 3.8222],
        [-1.1428],
        [ 2.0441]])






matrix([[ 1195.468]])
1
Xm.I * X
matrix([[  1.0000e+00,   6.9616e-17,  -4.0136e-17,   8.1258e-17],
        [ -2.3716e-17,   1.0000e+00,   2.2230e-17,  -2.5721e-17],
        [  1.0957e-16,   5.0783e-18,   1.0000e+00,   7.8658e-18],
        [ -5.7092e-17,  -3.7777e-18,   6.2391e-18,   1.0000e+00]])

高级数组输入输出

内存映像文件

1
2
mmap = np.memmap('mymmap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap
memmap([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
1
section = mmap[:5]
1
2
3
4
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap
memmap([[-1.273 , -0.1547,  0.7817, ...,  0.3421,  1.0272, -1.8742],
       [-0.3544, -3.1195,  0.1256, ..., -0.4476,  0.4863, -0.8311],
       [-1.1117,  0.8186,  2.3934, ...,  0.1061,  1.4123,  0.6489],
       ..., 
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])
1
2
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap
memmap([[-1.273 , -0.1547,  0.7817, ...,  0.3421,  1.0272, -1.8742],
       [-0.3544, -3.1195,  0.1256, ..., -0.4476,  0.4863, -0.8311],
       [-1.1117,  0.8186,  2.3934, ...,  0.1061,  1.4123,  0.6489],
       ..., 
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])
1
2
%xdel mmap
!del mymmap
NameError: name 'mmap' is not defined
C:\Users\Ewan\Downloads\pydata-book-master\mymmap


The process cannot access the file because it is being used by another process.

性能建议

连续内存的重要性

1
2
3
4
5
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous
  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False






  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False






True
1
2
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)
1000 loops, best of 3: 848 µs per loop
1000 loops, best of 3: 582 µs per loop
1
arr_f.copy('C').flags
C_CONTIGUOUS : True
F_CONTIGUOUS : False
OWNDATA : True
WRITEABLE : True
ALIGNED : True
UPDATEIFCOPY : False
1
2
arr_c[:50].flags.contiguous
arr_c[:, :50].flags
True






  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False
1
2
3
%xdel arr_c
%xdel arr_f
%cd ..
C:\Users\Ewan\Downloads

其他加速手段: Cython, f2py, C

1
2
3
4
5
6
7
8
9
10
from numpy cimport ndarray, float64_t


def sum_elements(ndarray[float64_t] arr):
    """Return the sum of the elements of a float64 array.

    The argument is declared as a typed NumPy buffer and the loop index and
    accumulator are C-typed via ``cdef``.
    """
    cdef Py_ssize_t i, n = len(arr)
    cdef float64_t result = 0

    # Plain indexed loop over the typed buffer, accumulating into a C double.
    for i in range(n):
        result += arr[i]

    return result