《利用python進行數據分析》——Chap4:numpy
本文為《利用python進行數據分析》內容的摘要及代碼實現,代碼可從github進行下載:
https://github.com/Ruiww/python_excercise/tree/master
Numpy的ndarray:一種多維數組對象
In [2]:
import numpy as np
In [5]:
#創建ndarray:np.array()-接受一切序列型對象
#所有元素的類型必須一致
data1 = [ 6,7.5,8,0,1 ]
arr1 = np.array(data1)
arr1
Out[5]:
array([ 6. , 7.5, 8. , 0. , 1. ])
In [6]:
#多維數組
data2 = [[1,2,3,4],[5,6,7,8]]
arr2 = np.array(data2)
arr2
Out[6]:
array([[1, 2, 3, 4],
[5, 6, 7, 8]])
In [7]:
#維度ndim、各維度大小shape、類型dtype
print(arr2.ndim) #2D
print(arr2.shape) #2行4列
print(arr2.dtype) #int
2
(2, 4)
int64
In [11]:
#其他新建數組函數 zeros/zeros_like/ones/ones_like/empty/empty_like/eye/identity
np.zeros(10)
np.zeros((2,3))
Out[11]:
array([[ 0., 0., 0.],
[ 0., 0., 0.]])
In [12]:
np.ones((3,3))
Out[12]:
array([[ 1., 1., 1.],
[ 1., 1., 1.],
[ 1., 1., 1.]])
In [13]:
np.empty((2,2)) #empty只分配內存而不初始化,數組內的數值是未初始化的任意值(並非真正的隨機數)
Out[13]:
array([[ 6. , 7.5],
[ 8. , 1. ]])
In [17]:
np.arange(15) #類比range()
Out[17]:
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
In [19]:
#創建NxN單位矩陣
np.identity(5)
np.eye(5)
Out[19]:
array([[ 1., 0., 0., 0., 0.],
[ 0., 1., 0., 0., 0.],
[ 0., 0., 1., 0., 0.],
[ 0., 0., 0., 1., 0.],
[ 0., 0., 0., 0., 1.]])
ndarray數據類型
In [22]:
#dtype
arr = np.array([1,2,3],dtype = np.int32)
arr
Out[22]:
array([1, 2, 3], dtype=int32)
In [25]:
arr.dtype
Out[25]:
dtype('int32')
In [30]:
#數組數據類型轉換,astype是複製出一個新的數組
float_arr = arr.astype(np.float64)
float_arr.dtype
Out[30]:
dtype('float64')
數組和標量間的運算
數組中每個元素的批量運算(矢量化,vectorization)
In [31]:
arr = np.array([[1,2,3],[4,5,6]],dtype = np.float64)
In [33]:
arr
Out[33]:
array([[ 1., 2., 3.],
[ 4., 5., 6.]])
In [34]:
#相同大小數組的運算會應用到元素級
arr+arr
Out[34]:
array([[ 2., 4., 6.],
[ 8., 10., 12.]])
In [35]:
arr-arr
Out[35]:
array([[ 0., 0., 0.],
[ 0., 0., 0.]])
In [36]:
arr*arr
Out[36]:
array([[ 1., 4., 9.],
[ 16., 25., 36.]])
In [37]:
arr*2
Out[37]:
array([[ 2., 4., 6.],
[ 8., 10., 12.]])
In [38]:
1/arr
Out[38]:
array([[ 1. , 0.5 , 0.33333333],
[ 0.25 , 0.2 , 0.16666667]])
In [39]:
arr**0.5
Out[39]:
array([[ 1. , 1.41421356, 1.73205081],
[ 2. , 2.23606798, 2.44948974]])
基本的索引和切片
類比列表
In [41]:
arr = np.arange(10)
arr
Out[41]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [44]:
arr[5]
arr[3:6]
Out[44]:
array([3, 4, 5])
In [45]:
#標量值賦予切片,會自動傳播到整個選區,且修改會反映在源數組
arr[3:6] = 12
arr
Out[45]:
array([ 0, 1, 2, 12, 12, 12, 6, 7, 8, 9])
In [47]:
#二維數組索引
arr_2d = np.array([[1,2,3],[4,5,6]])
arr_2d
Out[47]:
array([[1, 2, 3],
[4, 5, 6]])
In [52]:
#兩種索引方式效果相同
arr_2d[0][1]
arr_2d[0,1]
Out[52]:
2
In [61]:
#高維數組
arr_3d = np.empty((2,3,2),dtype = np.int32)
arr_3d
Out[61]:
array([[[ 0, 0],
[-716030302, 1073743866],
[ 3, 0]],
[[ 0, 0],
[ 0, 0],
[ 0, 196608]]], dtype=int32)
In [62]:
arr_3d[0] #取出x = 0的部分,是y,z的二維數組
Out[62]:
array([[ 0, 0],
[-716030302, 1073743866],
[ 3, 0]], dtype=int32)
In [63]:
arr_3d[0,1] #x=0,y=1,獲得z的數組
Out[63]:
array([-716030302, 1073743866], dtype=int32)
In [70]:
#切片索引
arr_2d = np.array([[1,2,3],[4,5,6],[7,8,9]])
arr_2d
Out[70]:
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
In [79]:
arr_2d[0:3,0:2] #取x=0,1,2/y=0,1的部分
Out[79]:
array([[1, 2],
[4, 5],
[7, 8]])
布爾型索引
In [82]:
names = np.array(['Bob','Joe','Will','Bob','Will','Joe','Joe'])
names == 'Bob' #數組的比較運算會生成一個布爾型數組
Out[82]:
array([ True, False, False, True, False, False, False], dtype=bool)
In [87]:
data = np.random.randn(7,4) #生成服從標準正態分佈的隨機數組
data
Out[87]:
array([[ 0.30259674, 0.24208886, -0.9682166 , 0.40383172],
[ 0.5725992 , -0.12968448, 0.63810121, -0.2346277 ], [ 0.5073446 , 0.32387646, -0.60321438, 0.44078496], [-2.68397171, 0.07343136, -0.45605493, -0.40389433], [ 0.40076906, -0.35347494, -1.06051769, 1.00414024], [ 0.30175622, 1.54504734, -0.6097428 , 1.89325618], [ 1.78867169, -0.85173715, -1.13219709, -2.23488805]])In [88]:
#使用布爾數組進行數組索引
data[names=='Bob'] #獲得x=0,3的部分
Out[88]:
array([[ 0.30259674, 0.24208886, -0.9682166 , 0.40383172],
[-2.68397171, 0.07343136, -0.45605493, -0.40389433]])
In [89]:
data[[0,3]] #同上
Out[89]:
array([[ 0.30259674, 0.24208886, -0.9682166 , 0.40383172],
[-2.68397171, 0.07343136, -0.45605493, -0.40389433]])
In [91]:
data[names=='Bob',:2]
Out[91]:
array([[ 0.30259674, 0.24208886],
[-2.68397171, 0.07343136]])
In [98]:
mask = (names == 'Bob') | (names == 'Will') # &-與,|-或, 布爾型數組中and 和 or 無效
data[mask]
Out[98]:
array([[ 0.30259674, 0.24208886, -0.9682166 , 0.40383172],
[ 0.5073446 , 0.32387646, -0.60321438, 0.44078496],
[-2.68397171, 0.07343136, -0.45605493, -0.40389433],
[ 0.40076906, -0.35347494, -1.06051769, 1.00414024]])
花式索引
指利用整數數組進行索引
In [101]:
arr = np.empty((8,4))
arr
Out[101]:
array([[ 0.00000000e+000, 2.00389640e+000, 2.21871213e-314,
2.21908720e-314], [ 2.21875063e-314, 2.21891739e-314, 2.21877679e-314, 2.21792386e-314], [ 2.21921089e-314, 2.21921118e-314, 2.21795516e-314, 2.21918651e-314], [ 2.21791444e-314, 2.21022229e-314, 2.20999900e-314, 2.21860965e-314], [ 2.21860986e-314, 2.21871237e-314, 2.21861692e-314, 2.21924649e-314], [ 2.21924659e-314, 2.21924687e-314, 2.21925541e-314, 2.21925560e-314], [ 2.21792421e-314, 2.21925462e-314, 2.21925706e-314, 2.21925519e-314], [ 2.21925481e-314, 2.21925506e-314, 2.21924706e-314, 2.22507608e-308]])In [103]:
for i in range(8):
    arr[i] = i
arr
Out[103]:
array([[ 0., 0., 0., 0.],
[ 1., 1., 1., 1.],
[ 2., 2., 2., 2.],
[ 3., 3., 3., 3.],
[ 4., 4., 4., 4.],
[ 5., 5., 5., 5.],
[ 6., 6., 6., 6.],
[ 7., 7., 7., 7.]])
In [104]:
#傳入指定順序的整數列表或ndarray,以特定順序選取子集
arr[[4,0,3,2]]
Out[104]:
array([[ 4., 4., 4., 4.],
[ 0., 0., 0., 0.],
[ 3., 3., 3., 3.],
[ 2., 2., 2., 2.]])
數組轉置和軸對換
In [9]:
#轉置:返回源數據的視圖而不複製數據
arr = np.arange(15).reshape((3,5)) #1d轉2d
arr
Out[9]:
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
In [10]:
arr.T
Out[10]:
array([[ 0, 5, 10],
[ 1, 6, 11],
[ 2, 7, 12],
[ 3, 8, 13],
[ 4, 9, 14]])
In [11]:
np.dot(arr.T,arr) #np.dot(),矩陣點積(矩陣乘法)
Out[11]:
array([[125, 140, 155, 170, 185],
[140, 158, 176, 194, 212],
[155, 176, 197, 218, 239],
[170, 194, 218, 242, 266],
[185, 212, 239, 266, 293]])
In [13]:
#transpose,使用軸編號的元組進行轉置
arr = np.arange(16).reshape((2,2,4))
arr
Out[13]:
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7]],
[[ 8, 9, 10, 11],
[12, 13, 14, 15]]])
In [14]:
arr.transpose((1,0,2))
Out[14]:
array([[[ 0, 1, 2, 3],
[ 8, 9, 10, 11]],
[[ 4, 5, 6, 7],
[12, 13, 14, 15]]])
In [15]:
#swapaxes(),交換一對軸
arr.swapaxes(1,2)
Out[15]:
array([[[ 0, 4],
[ 1, 5],
[ 2, 6],
[ 3, 7]],
[[ 8, 12],
[ 9, 13],
[10, 14],
[11, 15]]])
通用函數
In [16]:
#元素級數組函數,類比excel函數
利用數組進行數據處理
In [18]:
points = np.arange(-5,5,0.01)
points
In [39]:
xs,ys = np.meshgrid(points,points) #meshgrid生成網格型數據,接收兩個一維數組,生成2個二維數組
In [40]:
xs
Out[40]:
array([[-5. , -4.99, -4.98, ..., 4.97, 4.98, 4.99],
[-5. , -4.99, -4.98, ..., 4.97, 4.98, 4.99], [-5. , -4.99, -4.98, ..., 4.97, 4.98, 4.99], ..., [-5. , -4.99, -4.98, ..., 4.97, 4.98, 4.99], [-5. , -4.99, -4.98, ..., 4.97, 4.98, 4.99], [-5. , -4.99, -4.98, ..., 4.97, 4.98, 4.99]])In [41]:
ys
Out[41]:
array([[-5. , -5. , -5. , ..., -5. , -5. , -5. ],
[-4.99, -4.99, -4.99, ..., -4.99, -4.99, -4.99], [-4.98, -4.98, -4.98, ..., -4.98, -4.98, -4.98], ..., [ 4.97, 4.97, 4.97, ..., 4.97, 4.97, 4.97], [ 4.98, 4.98, 4.98, ..., 4.98, 4.98, 4.98], [ 4.99, 4.99, 4.99, ..., 4.99, 4.99, 4.99]])In [42]:
import matplotlib.pyplot as plt
z = np.sqrt(xs**2+ys**2)
z
Out[42]:
array([[ 7.07106781, 7.06400028, 7.05693985, ..., 7.04988652,
7.05693985, 7.06400028], [ 7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815, 7.05692568], [ 7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354, 7.04985815], ..., [ 7.04988652, 7.04279774, 7.03571603, ..., 7.0286414 , 7.03571603, 7.04279774], [ 7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354, 7.04985815], [ 7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815, 7.05692568]])In [71]:
plt.imshow(z)
plt.colorbar()
plt.title(r"image plot of $\sqrt{x^2+y^2}$ for a grid of values")
plt.show()
In [44]:
#使用數組運算表達條件邏輯
xarr = np.array([1.1,1.2,1.3,1.4,1.5])
yarr = np.array([2.1,2.2,2.3,2.4,2.5])
cond = np.array([True,False,True,True,False])
In [46]:
result = [(x if c else y) for x,y,c in zip(xarr,yarr,cond)]
result
Out[46]:
[1.1000000000000001, 2.2000000000000002, 1.3, 1.3999999999999999, 2.5]
In [49]:
result = np.where(cond,xarr,yarr) #適用於多維數組,且能夠實現嵌套用於複雜邏輯
result
Out[49]:
array([ 1.1, 2.2, 1.3, 1.4, 2.5])
數學和統計方法
In [51]:
#aggregation聚合運算(約減)
arr = np.random.randn(5,4)
arr
Out[51]:
array([[ 1.39833774, 0.43962417, 1.38977166, 0.70501487],
[ 1.30842151, 0.41674477, 0.62965793, -0.41984984], [-1.10573873, -0.37318421, 0.11682728, -0.03692315], [ 0.35539426, -0.50767123, 0.62759129, -1.27569057], [ 0.04332442, -0.1210773 , 0.50140486, 0.25657455]])In [52]:
arr.mean()
Out[52]:
0.21742771329571417
In [53]:
np.mean(arr)
Out[53]:
0.21742771329571417
In [59]:
arr.sum(axis=0)
Out[59]:
array([ 1.99973919, -0.1455638 , 3.26525303, -0.77087415])
In [63]:
arr = np.random.randn(100)
arr
Out[63]:
array([ 0.48262823, 0.511697 , 0.54549409, -0.77774644, -0.62340204,
-0.59429175, 0.11295305, -1.3854105 , 1.24462757, 1.06852993, 0.69304002, -2.64800882, 1.72916052, 0.12065841, -1.76143862, -2.03070374, -1.29571637, -0.60299982, -0.40474838, -0.92441152, 1.05309587, -1.43702212, 2.83354567, -0.43089056, 0.20115985, -1.79214718, 2.29808665, 0.53405151, -0.27378554, 3.21357331, 0.58016949, 0.92027828, 0.72865207, 1.12942179, -0.35213468, -0.88694468, 0.47956272, -1.16594112, -0.47908612, 2.63687567, -0.56193365, 0.11967331, 1.3895902 , -0.09599803, -0.44537606, -0.31143194, -0.72364893, 0.9595921 , -0.58358818, -1.33754472, 1.19690342, -0.57244989, 0.13698161, 0.8361684 , -0.86376946, 0.87973056, -0.89657187, 1.07735699, 0.3679787 , -0.38032608, -0.8713117 , -0.75827233, -1.50719385, -1.27138953, -0.0439037 , -2.01407532, -0.86676161, -0.55589103, 0.171621 , 0.16180338, 0.67421678, -0.0701451 , 1.2870454 , 1.03499482, -1.49585447, -0.06451731, 1.1988428 , -0.08072966, -1.77241565, 0.97360014, -1.1122304 , 0.54843454, -0.62752327, 0.07237886, 0.37113715, 0.95213627, -1.41293475, 0.34251529, -2.14819484, -0.38433564, 0.3345122 , 1.02517857, 0.0700207 , 0.4974566 , -0.15935018, 0.95854019, 0.01326315, 2.25932889, -1.66482643, 0.68546016])In [66]:
np.array(arr>0).sum() #布爾型數組的求和,計算TRUE的個數
Out[66]:
50
In [68]:
bools = np.array([True,True,False])
bools.any() #返回是否含有True
Out[68]:
True
In [70]:
bools.all() #是否全部是True
Out[70]:
False
排序
In [73]:
arr = np.random.randn(8)
arr
Out[73]:
array([ 0.28476221, -1.23647742, -1.13330382, 0.77880175, 0.091851 ,
0.40025669, 0.2662308 , -1.06532011])In [75]:
arr.sort() #原位排序
arr
Out[75]:
array([-1.23647742, -1.13330382, -1.06532011, 0.091851 , 0.2662308 ,
0.28476221, 0.40025669, 0.77880175])In [78]:
arr = np.random.randn(5,3)
arr
Out[78]:
array([[ 0.14725031, 0.95286709, 0.39007983],
[ 1.07037496, -0.97123275, -0.80741959], [ 1.81701563, -0.10686443, 0.65792766], [-1.30897912, 0.89118022, -1.15935802], [ 0.44042573, 0.81265977, -0.32377458]])In [79]:
arr.sort(1) #選定軸編號進行排序
arr
Out[79]:
array([[ 0.14725031, 0.39007983, 0.95286709],
[-0.97123275, -0.80741959, 1.07037496],
[-0.10686443, 0.65792766, 1.81701563],
[-1.30897912, -1.15935802, 0.89118022],
[-0.32377458, 0.44042573, 0.81265977]])
唯一化及其他集合邏輯
In [82]:
#np.unique()
ints = np.array([1,1,43,53,4,2,2])
np.unique(ints)
Out[82]:
array([ 1, 2, 4, 43, 53])
In [83]:
#集合運算:intersect1d(x,y)/union1d(x,y)...... #集合交、並、差值等
線性代數
In [88]:
#np.linalg #numpy提供的數組運算函數
np.dot(arr.T,arr) #矩陣點積
arr*arr #元素級乘積,與dot矩陣乘積區分
Out[88]:
array([[ 0.02168265, 0.15216228, 0.9079557 ],
[ 0.94329305, 0.6519264 , 1.14570256],
[ 0.01142001, 0.4328688 , 3.3015458 ],
[ 1.71342634, 1.34411102, 0.79420219],
[ 0.10482998, 0.19397482, 0.6604159 ]])
隨機值生成
In [ ]:
#np.random.XXX
推薦閱讀:
※數據分析師經常用到哪些好用的數據處理工具?
※初探券商研報—基於網路爬蟲獲取2010-2016和訊網數據
※《R語言實戰》第四部分第十三章-廣義線性模型學習筆記
※大數據技術是否有可能拯救計劃經濟?
※美美的商務范兒——蝴蝶圖

