版權宣告:本套技術專欄是作者(秦凱新)平時工作的總結和昇華,通過從真實商業環境抽取案例進行總結和分享,並給出商業應用的調優建議和叢集環境容量規劃等內容,請持續關注本套部落格。QQ郵箱地址:1120746959@qq.com,如有任何學術交流,可隨時聯絡。
1 Numpy詳細使用
-
讀取txt檔案
import numpy world_alcohol = numpy.genfromtxt("world_alcohol.txt", delimiter=",") print(type(world_alcohol)) world_alcohol = numpy.genfromtxt("world_alcohol.txt", delimiter=",", dtype="U75", skip_header=1) print(world_alcohol) [[u'1986' u'Western Pacific' u'Viet Nam' u'Wine' u'0'] [u'1986' u'Americas' u'Uruguay' u'Other' u'0.5'] [u'1985' u'Africa' u"Cte d'Ivoire" u'Wine' u'1.62'] ..., [u'1987' u'Africa' u'Malawi' u'Other' u'0.75'] [u'1989' u'Americas' u'Bahamas' u'Wine' u'1.5'] [u'1985' u'Africa' u'Malawi' u'Spirits' u'0.31']] 複製程式碼
-
建立一維和二維的Array陣列
# numpy.array() accepts a list or a list of lists. A flat list produces a
# one-dimensional array; a list of lists produces a two-dimensional array.
# One-dimensional array: []
vector = numpy.array([5, 10, 15, 20])
# Two-dimensional array: [[], [], []]
matrix = numpy.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
# print() is called as a function so the snippet also runs on Python 3.
print(vector)
print(matrix)
-
shape用法
#We can use the ndarray.shape property to figure out how many elements are in the array vector = numpy.array([1, 2, 3, 4]) print(vector.shape) #For matrices, the shape property contains a tuple with 2 elements. matrix = numpy.array([[5, 10, 15], [20, 25, 30]]) print(matrix.shape) (4,) (2, 3) 複製程式碼
-
dtype用法(numpy要求numpy.array內部所有元素的型別相同)
numbers = numpy.array([1, 2, 3, 4]) numbers.dtype dtype('int32') #改變其中一個值時,其他值都會改變 numbers = numpy.array([1, 2, 3, '4']) print(numbers) numbers.dtype ['1' '2' '3' '4'] dtype('<U11') 複製程式碼
-
索引定位
[[u'1986' u'Western Pacific' u'Viet Nam' u'Wine' u'0'] [u'1986' u'Americas' u'Uruguay' u'Other' u'0.5'] [u'1985' u'Africa' u"Cte d'Ivoire" u'Wine' u'1.62'] ..., [u'1987' u'Africa' u'Malawi' u'Other' u'0.75'] [u'1989' u'Americas' u'Bahamas' u'Wine' u'1.5'] [u'1985' u'Africa' u'Malawi' u'Spirits' u'0.31']] uruguay_other_1986 = world_alcohol[1,4] third_country = world_alcohol[2,2] print uruguay_other_1986 print third_country 0.5 Cte d'Ivoire 複製程式碼
-
索引切片
vector = numpy.array([5, 10, 15, 20]) print(vector[0:3]) [ 5 10 15] 複製程式碼
-
取某一列(:表示所有行)
matrix = numpy.array([ [5, 10, 15], [20, 25, 30], [35, 40, 45] ]) print(matrix[:,1]) [10 25 40] matrix = numpy.array([ [5, 10, 15], [20, 25, 30], [35, 40, 45] ]) print(matrix[:,0:2]) [[ 5 10] [20 25] [35 40]] matrix = numpy.array([ [5, 10, 15], [20, 25, 30], [35, 40, 45] ]) print(matrix[1:3,0:2]) [[20 25] [35 40]] 複製程式碼
-
對Array操作表示對內部所有元素進行操作
import numpy #it will compare the second value to each element in the vector # If the values are equal, the Python interpreter returns True; otherwise, it returns False vector = numpy.array([5, 10, 15, 20]) vector == 10 array([False, True, False, False], dtype=bool) matrix = numpy.array([ [5, 10, 15], [20, 25, 30], [35, 40, 45] ]) matrix == 25 array([[False, False, False], [False, True, False], [False, False, False]], dtype=bool) 複製程式碼
-
布林值當索引([False True False False])
# A boolean array can be used as an index: only the positions holding True
# are kept.
vector = numpy.array([5, 10, 15, 20])
equal_to_ten = (vector == 10)
print(equal_to_ten)
print(vector[equal_to_ten])
# Output:
# [False  True False False]
# [10]

# Boolean indexing on a matrix: the mask built from the second column
# selects whole rows.
matrix = numpy.array([
    [5, 10, 15],
    [20, 25, 30],
    [35, 40, 45],
])
second_column_25 = (matrix[:, 1] == 25)
print(second_column_25)
print(matrix[second_column_25, :])
# Output:
# [False  True False]
# [[20 25 30]]
-
對陣列進行與(&)、或(|)運算
# Comparisons can be combined: & is the element-wise AND, | is the
# element-wise OR.
vector = numpy.array([5, 10, 15, 20])
equal_to_ten_and_five = (vector == 10) & (vector == 5)
print(equal_to_ten_and_five)
# Output:
# [False False False False]

vector = numpy.array([5, 10, 15, 20])
equal_to_ten_or_five = (vector == 10) | (vector == 5)
print(equal_to_ten_or_five)
# Output:
# [ True  True False False]
-
值型別轉換
# astype() returns a copy of the array converted to the requested type.
vector = numpy.array(["1", "2", "3"])
print(vector.dtype)
print(vector)
vector = vector.astype(float)
print(vector.dtype)
print(vector)
# Output (the string dtype prints as |S1 on Python 2 and <U1 on Python 3):
# <U1
# ['1' '2' '3']
# float64
# [1. 2. 3.]
-
聚合求解
vector = numpy.array([5, 10, 15, 20]) vector.sum() 複製程式碼
-
按行求和(axis=1)
matrix = numpy.array([ [5, 10, 15], [20, 25, 30], [35, 40, 45] ]) matrix.sum(axis=1) array([ 30, 75, 120]) 複製程式碼
-
按列求和(axis=0)
matrix = numpy.array([ [5, 10, 15], [20, 25, 30], [35, 40, 45] ]) matrix.sum(axis=0) array([60, 75, 90]) 複製程式碼
-
矩陣操作:np.arange(N) 生成 0 到 N-1 的整數
import numpy as np a = np.arange(15).reshape(3, 5) a array([[ 0, 1, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]) a.ndim 2 a.dtype.name 'int32' a.size 15 複製程式碼
-
矩陣初始化
np.zeros ((3,4)) array([[ 0., 0., 0., 0.], [ 0., 0., 0., 0.], [ 0., 0., 0., 0.]]) np.ones( (2,3,4), dtype=np.int32 ) array([[[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]) 複製程式碼
-
按照間隔生成資料
np.arange( 10, 30, 5 ) array([10, 15, 20, 25]) np.arange( 0, 2, 0.3 ) array([ 0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8]) 複製程式碼
-
隨機生成資料
np.random.random((2,3)) array([[ 0.40130659, 0.45452825, 0.79776512], [ 0.63220592, 0.74591134, 0.64130737]]) 複製程式碼
-
linspace在0到2pi之間取100個數
from numpy import pi np.linspace( 0, 2*pi, 100 ) array([ 0. , 0.06346652, 0.12693304, 0.19039955, 0.25386607, 0.31733259, 0.38079911, 0.44426563, 0.50773215, 0.57119866, 0.63466518, 0.6981317 , 0.76159822, 0.82506474, 0.88853126, 0.95199777, 1.01546429, 1.07893081, 1.14239733, 1.20586385, 1.26933037, 1.33279688, 1.3962634 , 1.45972992, 1.52319644, 1.58666296, 1.65012947, 1.71359599, 1.77706251, 1.84052903, 1.90399555, 1.96746207, 2.03092858, 2.0943951 , 2.15786162, 2.22132814, 2.28479466, 2.34826118, 2.41172769, 2.47519421, 2.53866073, 2.60212725, 2.66559377, 2.72906028, 2.7925268 , 2.85599332, 2.91945984, 2.98292636, 3.04639288, 3.10985939, 3.17332591, 3.23679243, 3.30025895, 3.36372547, 3.42719199, 3.4906585 , 3.55412502, 3.61759154, 3.68105806, 3.74452458, 3.8079911 , 3.87145761, 3.93492413, 3.99839065, 4.06185717, 4.12532369, 4.1887902 , 4.25225672, 4.31572324, 4.37918976, 4.44265628, 4.5061228 , 4.56958931, 4.63305583, 4.69652235, 4.75998887, 4.82345539, 4.88692191, 4.95038842, 5.01385494, 5.07732146, 5.14078798, 5.2042545 , 5.26772102, 5.33118753, 5.39465405, 5.45812057, 5.52158709, 5.58505361, 5.64852012, 5.71198664, 5.77545316, 5.83891968, 5.9023862 , 5.96585272, 6.02931923, 6.09278575, 6.15625227, 6.21971879, 6.28318531]) 複製程式碼
-
矩陣基本操作
#the product operator * operates elementwise in NumPy arrays a = np.array( [20,30,40,50] ) b = np.arange( 4 ) print (a) print (b) #b c = a-b print (c) b**2 print (b**2) print (a<35) [20 30 40 50] [0 1 2 3] [20 29 38 47] [ True True False False] 複製程式碼
-
矩陣相乘
#The matrix product can be performed using the dot function or method A = np.array([[1,1], [0,1]] ) B = np.array([[2,0], [3,4]]) print (A) print (B) print (A*B) print (A.dot(B)) print (np.dot(A, B) ) [[1 1] [0 1]] [[2 0] [3 4]] [[2 0] [0 4]] [[5 4] [3 4]] [[5 4] [3 4]] 複製程式碼
-
矩陣操作floor向下取整
import numpy as np B = np.arange(3) print (B) #print np.exp(B) print (np.sqrt(B)) [0 1 2] [0. 1. 1.41421356] #Return the floor of the input a = np.floor(10*np.random.random((3,4))) #print a #Return the floor of the input a = np.floor(10*np.random.random((3,4))) print (a) print(a.reshape(2,-1)) [[0. 4. 2. 2.] [8. 1. 5. 7.] [0. 9. 7. 4.]] [[0. 4. 2. 2. 8. 1.] [5. 7. 0. 9. 7. 4.]] 複製程式碼
-
hstack/vstack 矩陣拼接與 hsplit 矩陣切分
a = np.floor(10*np.random.random((2,2))) b = np.floor(10*np.random.random((2,2))) print a print '---' print b print '---' print np.hstack((a,b)) [[ 5. 6.] [ 1. 5.]] --- [[ 8. 6.] [ 9. 0.]] --- [[ 5. 6. 8. 6.] [ 1. 5. 9. 0.]] a = np.floor(10*np.random.random((2,2))) b = np.floor(10*np.random.random((2,2))) print (a) print ('---') print (b) print ('---') #print np.hstack((a,b)) np.vstack((a,b)) [[7. 7.] [2. 6.]] --- [[0. 6.] [0. 3.]] --- array([[1., 0.], [3., 6.], [4., 2.], [8., 7.]]) a = np.floor(10*np.random.random((2,12))) print (a) print (np.hsplit(a,3)) [[6. 5. 2. 4. 2. 4. 9. 4. 4. 6. 8. 9.] [8. 4. 0. 2. 6. 5. 2. 5. 0. 4. 1. 6.]] [array([[6., 5., 2., 4.], [8., 4., 0., 2.]]), array([[2., 4., 9., 4.], [6., 5., 2., 5.]]), array([[4., 6., 8., 9.], [0., 4., 1., 6.]])] 複製程式碼
-
任意選擇切分位置
print ( np.hsplit(a,(3,4))) # Split a after the third and the fourth column [[2. 8. 4. 7. 6. 6. 5. 8. 8. 3. 0. 1.] [3. 5. 9. 4. 5. 8. 7. 6. 2. 3. 8. 4.]] [array([[2., 8., 4.], [3., 5., 9.]]), array([[7.], [4.]]), array([[6., 6., 5., 8., 8., 3., 0., 1.], [5., 8., 7., 6., 2., 3., 8., 4.]])] 複製程式碼
-
變數賦值
-
變數檢視
-
copy實現變數之間沒有關係
d = a.copy() d is a d[0,0] = 9999 print d print a [[9999 1 2 3] [1234 5 6 7] [ 8 9 10 11]] [[ 0 1 2 3] [1234 5 6 7] [ 8 9 10 11]] 複製程式碼
-
尋找列最大值索引
-
行列按照倍數擴充套件(行3倍列5倍)
a = np.arange(0, 40, 10)
# np.tile repeats `a` 3 times along the rows and 5 times along the columns.
b = np.tile(a, (3, 5))
print(b)
# Output:
# [[ 0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30]
#  [ 0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30]
#  [ 0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30]]
-
按照元素大小排序並給出索引值
a = np.array([4, 3, 1, 2])
# argsort returns the indices that would sort the array in ascending order.
j = np.argsort(a)
print(j)
# Indexing with those indices yields the sorted values.
print(a[j])
# Output:
# [2 3 1 0]
# [1 2 3 4]
-
對陣列按照元素大小排序
a = np.array([[4, 3, 5], [1, 2, 1]]) #print a b = np.sort(a, axis=1) print (b) [[3 4 5] [1 1 2]] 複製程式碼
2 Pandas詳細使用(底層基於Numpy)
2.1 Pandas基本操作
- Pandas核心結構(DataFrame)
- Pandas 字元型表示為Object
- Pandas資料基本型別展示
import pandas
food_info = pandas.read_csv("food_info.csv")
print(type(food_info))
<class 'pandas.core.frame.DataFrame'>
col_names = food_info.columns.tolist()
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)',
'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)',
'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)',
'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE',
'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)',
'FA_Poly_(g)', 'Cholestrl_(mg)']
# Show the dtype of every column; print() as a function also runs on Python 3.
print(food_info.dtypes)
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object
複製程式碼
-
Pandas基本操作
#可以指定數量 #first_rows = food_info.head() #print(food_info.head(3)) 複製程式碼
#print food_info.columns
複製程式碼
#print food_info.shape
(8618,36)
複製程式碼
-
取資料操作
#pandas uses zero-indexing #Series object representing the row at index 0. #print food_info.loc[0] # Series object representing the seventh row. #food_info.loc[6] # Will throw an error: "KeyError: 'the label [8620] is not in the [index]'" #food_info.loc[8620] #The object dtype is equivalent to a string in Python 複製程式碼
-
資料切片
# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6. #food_info.loc[3:6] # Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work. # Method 1 #two_five_ten = [2,5,10] #food_info.loc[two_five_ten] # Method 2 #food_info.loc[[2,5,10]] 複製程式碼
-
通過列名取出資料
# Series object representing the "NDB_No" column. #ndb_col = food_info["NDB_No"] #print ndb_col # Alternatively, you can access a column by passing in a string variable. #col_name = "NDB_No" #ndb_col = food_info[col_name] 複製程式碼
-
取出兩個列的值
#columns = ["Zinc_(mg)", "Copper_(mg)"] #zinc_copper = food_info[columns] #print zinc_copper #print zinc_copper # Skipping the assignment. #zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]] 複製程式碼
-
endswith 定位取值
#print(food_info.columns) #print(food_info.head(2)) col_names = food_info.columns.tolist() #print col_names gram_columns = [] for c in col_names: if c.endswith("(g)"): gram_columns.append(c) gram_df = food_info[gram_columns] print(gram_df.head(3)) 複製程式碼
2.2 Series型別上場
Series 是一個帶有 名稱 和索引的一維陣列,既然是陣列,肯定要說到的就是陣列中的元素型別,在 Series 中包含的資料型別可以是整數、浮點、字串、Python物件等。
# 儲存了 4 個年齡:18/30/25/40
user_age = pd.Series(data=[18, 30, 25, 40])
user_age
0 18
1 30
2 25
3 40
dtype: int64
複製程式碼
-
指定索引
user_age.index = ["Tom", "Bob", "Mary", "James"] user_age Tom 18 Bob 30 Mary 25 James 40 dtype: int64 複製程式碼
-
為 index 起個名字
user_age.index.name = "name" user_age name Tom 18 Bob 30 Mary 25 James 40 dtype: int64 複製程式碼
-
給 Series 起個名字
user_age.name="user_age_info" user_age name Tom 18 Bob 30 Mary 25 James 40 Name: user_age_info, dtype: int64 複製程式碼
-
一個 Series 包括了 data、index 以及 name。
# 構建索引 name = pd.Index(["Tom", "Bob", "Mary", "James"], name="name") # 構建 Series user_age = pd.Series(data=[18, 30, 25, 40], index=name, name="user_age_info") user_age name Tom 18 Bob 30 Mary 25 James 40 Name: user_age_info, dtype: int64 # 指定型別為浮點型 user_age = pd.Series(data=[18, 30, 25, 40], index=name, name="user_age_info", dtype=float) user_age name Tom 18.0 Bob 30.0 Mary 25.0 James 40.0 Name: user_age_info, dtype: float64 複製程式碼
-
Series 包含了 dict 的特點,也就意味著可以使用與 dict 類似的一些操作。我們可以將 index 中的元素看成是 dict 中的 key。
# 獲取 Tom 的年齡 user_age["Tom"] 18.0 user_age.get("Tom") 18.0 # 指定索引,獲取第一個元素 user_age[0] 18.0 # 獲取前三個元素 user_age[:3] name Tom 18.0 Bob 30.0 Mary 25.0 Name: user_age_info, dtype: float64 # 獲取年齡大於30的元素 user_age[user_age > 30] name James 40.0 Name: user_age_info, dtype: float64 # 獲取第4個和第二個元素 user_age[[3, 1]] name James 40.0 Bob 30.0 Name: user_age_info, dtype: float64 複製程式碼
2.3 DataFrame隆重登場
-
DataFrame 是一個帶有索引的二維資料結構,每列可以有自己的名字,並且可以有不同的資料型別。你可以把它想象成一個 excel 表格或者資料庫中的一張表,DataFrame 是最常用的 Pandas 物件。
index = pd.Index(data=["Tom", "Bob", "Mary", "James"], name="name") data = { "age": [18, 30, 25, 40], "city": ["BeiJing", "ShangHai", "GuangZhou", "ShenZhen"] } user_info = pd.DataFrame(data=data, index=index) user_info 複製程式碼
-
通過索引名來訪問某行,這種辦法需要藉助 loc 方法
user_info.loc["Tom"] age 18 city BeiJing Name: Tom, dtype: object 複製程式碼
-
通過這行所在的位置來選擇這一行
user_info.iloc[0] age 18 city BeiJing Name: Tom, dtype: object 複製程式碼
-
如何訪問多行
user_info.iloc[1:3] 複製程式碼
-
訪問列
user_info.age name Tom 18 Bob 30 Mary 25 James 40 Name: age, dtype: int64 user_info["age"] name Tom 18 Bob 30 Mary 25 James 40 Name: age, dtype: int64 #可以變換列的順序 user_info[["city", "age"]] 複製程式碼
2.4 DataFrame資料處理操作
-
info 函式(型別和缺失值統計)
user_info.info() Index: 4 entries, Tom to James Data columns (total 3 columns): age 4 non-null int64 city 4 non-null object sex 4 non-null object dtypes: int64(1), object(2) memory usage: 128.0+ bytes user_info.head(2) user_info.shape (4, 3) user_info.T 複製程式碼
-
通過 DataFrame 來獲取它包含的原有資料
user_info.values array([[18, 'BeiJing', 'male'], [30, 'ShangHai', 'male'], [25, 'GuangZhou', 'female'], [40, 'ShenZhen', 'male']], dtype=object) 複製程式碼
-
統計
user_info.age.max() 複製程式碼
-
累加求和
user_info.age.cumsum() name Tom 18 Bob 48 Mary 73 James 113 Name: age, dtype: int64 user_info.sex.cumsum() name Tom male Bob malemale Mary malemalefemale James malemalefemalemale Name: sex, dtype: object 複製程式碼
-
統計指標彙總(總數、平均數、標準差、最小值、最大值、25%/50%/75% 分位數)
user_info.describe() 複製程式碼
user_info.describe(include=["object"])
複製程式碼
-
統計下某列中每個值出現的次數
user_info.sex.value_counts() male 3 female 1 Name: sex, dtype: int64 複製程式碼
-
獲取某列最大值或最小值對應的索引
user_info.age.idxmax() 'James' 複製程式碼
-
離散化(分桶)
pd.cut(user_info.age, 3) name Tom (17.978, 25.333] Bob (25.333, 32.667] Mary (17.978, 25.333] James (32.667, 40.0] Name: age, dtype: category Categories (3, interval[float64]): [(17.978, 25.333] < (25.333, 32.667] < (32.667, 40.0]] 複製程式碼
-
自定義分桶
pd.cut(user_info.age, [1, 18, 30, 50]) name Tom (1, 18] Bob (18, 30] Mary (18, 30] James (30, 50] Name: age, dtype: category Categories (3, interval[int64]): [(1, 18] < (18, 30] < (30, 50]] 複製程式碼
-
離散化之後,給每個區間起個名字
pd.cut(user_info.age, [1, 18, 30, 50], labels=["childhood", "youth", "middle"]) name Tom childhood Bob youth Mary youth James middle Name: age, dtype: category Categories (3, object): [childhood < youth < middle] 複製程式碼
-
按照索引進行正序排的
user_info.sort_index() 複製程式碼
-
按照列進行倒序排,可以設定引數 axis=1 和 ascending=False。
user_info.sort_index(axis=1, ascending=False) 複製程式碼
-
按照實際值來排序
user_info.sort_values(by="age") 複製程式碼
user_info.sort_values(by=["age", "city"])
複製程式碼
-
獲取最大的 n 個值或最小的 n 個值
user_info.age.nlargest(2) name James 40 Bob 30 Name: age, dtype: int64 複製程式碼
-
函式應用map
user_info.age.map(lambda x: "yes" if x >= 30 else "no") name Tom no Bob yes Mary no James yes Name: age, dtype: object city_map = { "BeiJing": "north", "ShangHai": "south", "GuangZhou": "south", "ShenZhen": "south" } # 傳入一個 map user_info.city.map(city_map) name Tom north Bob south Mary south James south Name: city, dtype: object 複製程式碼
-
函式應用apply
# 對 Series 來說,apply 方法 與 map 方法區別不大。 user_info.age.apply(lambda x: "yes" if x >= 30 else "no") name Tom no Bob yes Mary no James yes Name: age, dtype: object # 對 DataFrame 來說,apply 方法的作用物件是一行或一列資料(一個Series) user_info.apply(lambda x: x.max(), axis=0) age 40 city ShenZhen sex male dtype: object 複製程式碼
-
作用於 DataFrame 中的每個元素applymap
user_info.applymap(lambda x: str(x).lower()) 複製程式碼
-
新增新列
user_info["height"] = ["178", "168", "178", "180cm"] user_info 複製程式碼
-
型別轉換
預設情況下,errors='raise',這意味著強轉失敗後直接丟擲異常,設定 errors='coerce' 可以在強轉失敗時將有問題的元素賦值為 pd.NaT(對於datetime和timedelta)或 np.nan(數字)。設定 errors='ignore' 可以在強轉失敗時返回原有的資料。 pd.to_numeric(user_info.height, errors="coerce") name Tom 178.0 Bob 168.0 Mary 178.0 James NaN Name: height, dtype: float64 pd.to_numeric(user_info.height, errors="ignore") name Tom 178 Bob 168 Mary 178 James 180cm Name: height, dtype: object 複製程式碼
2.5 缺失值處理
待補充
2.6 Pandas案例實戰
2.6.1 案例實戰1
import pandas
food_info = pandas.read_csv("C:\\ML\\MLData\\food_info.csv")
col_names = food_info.columns.tolist()
print(col_names)
print(food_info.head(3))
複製程式碼
針對某一列進行四則運算
#print food_info["Iron_(mg)"]
#div_1000 = food_info["Iron_(mg)"] / 1000
#print div_1000
# Adds 100 to each value in the column and returns a Series object.
#add_100 = food_info["Iron_(mg)"] + 100
# Subtracts 100 from each value in the column and returns a Series object.
#sub_100 = food_info["Iron_(mg)"] - 100
# Multiplies each value in the column by 2 and returns a Series object.
#mult_2 = food_info["Iron_(mg)"]*2
#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
# Multiply two columns row by row (the first values together, the second
# values together, and so on).
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
# Divide the milligram column by 1000 and store the result as a new
# "Iron_(g)" column.
iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams
#追加新列
max_calories = food_info["Energ_Kcal"].max()
print(max_calories)
# Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
#排序
#By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame
# Sorts the DataFrame in-place, rather than returning a new DataFrame.
#print food_info["Sodium_(mg)"]
food_info.sort_values("Sodium_(mg)", inplace=True)
#print (food_info["Sodium_(mg)"])
#Sorts by descending order, rather than ascending.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print (food_info["Sodium_(mg)"])
複製程式碼
2.6.2 泰坦尼克案例實戰2
import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("C:\\ML\\MLData\\titanic_train.csv")
# Column notes (SibSp/Parch/Pclass meanings follow the public Kaggle Titanic
# data dictionary -- TODO confirm against this CSV):
#SibSp: number of siblings/spouses aboard
#Parch: number of parents/children aboard
#Pclass: passenger (ticket) class
#Cabin: cabin number; NaN marks a missing (empty) value
#Embarked: port of embarkation, one of the three ports S, C, Q
titanic_survival.head()
複製程式碼
-
空值判斷及展示
#The Pandas library uses NaN, which stands for "not a number", to indicate a missing value. #we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values age = titanic_survival["Age"] #print(age.loc[0:10]) age_is_null = pd.isnull(age) #print (age_is_null) age_null_true = age[age_is_null] print (age_null_true) age_null_count = len(age_null_true) #print(age_null_count) 5 NaN 17 NaN 19 NaN 26 NaN 28 NaN 29 NaN 31 NaN 32 NaN 36 NaN 42 NaN 45 NaN 46 NaN 複製程式碼
-
含有空值時將無法計算
# Any calculation that involves a null value also results in a null value,
# so averaging the raw column gives nan.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)
# Output:
# nan

# Filter out the missing values before computing the mean (works, but is
# somewhat verbose). ~ inverts the boolean null mask.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
# Output:
# 29.69911764705882

# Missing data is so common that many pandas methods filter it out
# automatically.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
# Output:
# 29.6991176471
-
每個船艙位的平均價格
passenger_classes = [1, 2, 3] fares_by_class = {} for this_class in passenger_classes: pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class] pclass_fares = pclass_rows["Fare"] fare_for_class = pclass_fares.mean() fares_by_class[this_class] = fare_for_class print (fares_by_class) {1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997} # pandas不同列之間的關係,pivot_table高階用法 passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean) print (passenger_survival) Survived Pclass 1 0.629630 2 0.472826 3 0.242363 # 預設求均值 passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age") print(passenger_age) Pclass 1 38.233441 2 29.877630 3 25.140620 Name: Age, dtype: float64 #多列之間關係 port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum) print(port_stats) Fare Survived Embarked C 10072.2962 93 Q 1022.2543 30 S 17439.3988 217 複製程式碼
-
丟掉缺失值:axis=1 表示刪除含有空值的列,axis=0 表示刪除含有空值的行
#specifying axis=1 or axis='columns' will drop any columns that have null values drop_na_columns = titanic_survival.dropna(axis=1) new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"]) #print new_titanic_survival 複製程式碼
-
通過索引和列名
row_index_83_age = titanic_survival.loc[83,"Age"] row_index_1000_pclass = titanic_survival.loc[766,"Pclass"] print (row_index_83_age) print (row_index_1000_pclass) 28.0 1 複製程式碼
-
按值排序(索引不變),並重設索引
# Sort by value; every row keeps its original index label.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
# print(new_titanic_survival[0:10])
# reset_index(drop=True) discards the old labels and renumbers the rows.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.iloc[0:10])
-
返回第100行資料
def hundredth_row(column):
    """Return the hundredth item (position 99) of a Series."""
    return column.iloc[99]


# DataFrame.apply calls the function once per column. The result is stored
# under its own name so the function is not overwritten by its return value.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)
# Output:
# PassengerId                  100
# Survived                       0
# Pclass                         2
# Name           Kantor, Mr. Sinai
# Sex                         male
# Age                           34
# SibSp                          1
# Parch                          0
# Ticket                    244367
# Fare                          26
# Cabin                        NaN
# Embarked                       S
# dtype: object
-
自定義函式統計每列的空值個數
def not_null_count(column):
    """Return how many entries of ``column`` are null.

    Note: despite its name, this counts the missing values, not the
    present ones.
    """
    missing_mask = pd.isnull(column)
    return int(missing_mask.sum())


# Applied to a DataFrame, the function runs once per column.
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
# Output:
# PassengerId      0
# Survived         0
# Pclass           0
# Name             0
# Sex              0
# Age            177
# SibSp            0
# Parch            0
# Ticket           0
# Fare             0
# Cabin          687
# Embarked         2
# dtype: int64
-
定義級別axis=1表示行
def which_class(row):
    """Translate a row's Pclass value into a readable class label."""
    ticket_class = row['Pclass']
    if pd.isnull(ticket_class):
        return "Unknown"
    labels = {1: "First Class", 2: "Second Class", 3: "Third Class"}
    # Any other value yields None, matching an if/elif chain with no else.
    return labels.get(ticket_class)


# Passing axis=1 makes DataFrame.apply iterate over rows instead of columns.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
# Output (truncated):
# 0     Third Class
# 1     First Class
# 2     Third Class
# 3     First Class
# 4     Third Class
# 5     Third Class
# 6     First Class
# 7     Third Class
-
自定義函式
def is_minor(row):
    """Return True when the row's Age is below 18.

    A missing age (NaN) compares as False.
    """
    return bool(row["Age"] < 18)


minors = titanic_survival.apply(is_minor, axis=1)


def generate_age_label(row):
    """Label a row as "unknown", "minor" or "adult" based on its Age."""
    age = row["Age"]
    # The null check must come first: comparisons against NaN are always False.
    if pd.isnull(age):
        return "unknown"
    if age < 18:
        return "minor"
    return "adult"


age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)
# Output (truncated):
# 0       adult
# 1       adult
# 2       adult
# 3       adult
# 4       adult
# 5     unknown
# 6       adult
# 7       minor
# 8       adult
# 9       minor
# 10      minor
# 11      adult
# 12      adult
-
分類類別
# Attach the labels as a new column, then aggregate "Survived" per label.
titanic_survival['age_labels'] = age_labels
# No aggfunc is passed, so pivot_table uses its default aggregation (the mean).
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)
# Output:
# age_labels
# adult      0.381032
# minor      0.539823
# unknown    0.293785
# Name: Survived, dtype: float64
2.6.3 Series 案例實戰3
# By default, a single row or column taken from a DataFrame is a Series.
# The alias must be `pd`, because every call below uses `pd`.
import pandas as pd
fandango = pd.read_csv('C:\\ML\\MLData\\fandango_score_comparison.csv')
series_film = fandango['FILM']
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']
print(series_rt[0:5])
0 Avengers: Age of Ultron (2015)
1 Cinderella (2015)
2 Ant-Man (2015)
3 Do You Believe? (2015)
4 Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
0 74
1 85
2 80
3 18
4 14
Name: RottenTomatoes, dtype: int64
複製程式碼
-
設定索引列,根據索引取得對應的值
# Import the Series object from pandas from pandas import Series film_names = series_film.values #print (type(film_names)) #print (film_names) rt_scores = series_rt.values #print (rt_scores) series_custom = Series(rt_scores , index=film_names) #根據索引取得對應的值 series_custom[['Minions (2015)', 'Leviathan (2014)']] Minions (2015) 54 Leviathan (2014) 99 dtype: int64 複製程式碼
-
範圍查詢
series_custom = Series(rt_scores , index=film_names) #series_custom[['Minions (2015)', 'Leviathan (2014)']] fiveten = series_custom[5:10] print(fiveten) The Water Diviner (2015) 63 Irrational Man (2015) 42 Top Five (2014) 86 Shaun the Sheep Movie (2015) 99 Love & Mercy (2015) 89 dtype: int64 複製程式碼
-
索引排序,並重設
original_index = series_custom.index.tolist() #print original_index sorted_index = sorted(original_index) sorted_by_index = series_custom.reindex(sorted_index) #print sorted_by_index '71 (2015) 97 5 Flights Up (2015) 52 A Little Chaos (2015) 40 A Most Violent Year (2014) 90 About Elly (2015) 97 Aloha (2015) 19 American Sniper (2015) 72 複製程式碼
-
Series 值排序
sc2 = series_custom.sort_index() sc3 = series_custom.sort_values() #print(sc2[0:10]) print(sc3[0:10]) Paul Blart: Mall Cop 2 (2015) 5 Hitman: Agent 47 (2015) 7 Hot Pursuit (2015) 8 Fantastic Four (2015) 9 Taken 3 (2015) 9 The Boy Next Door (2015) 10 The Loft (2015) 11 Unfinished Business (2015) 11 Mortdecai (2015) 12 Seventh Son (2015) 12 複製程式碼
-
Series對應索引相加
# The values in a Series object are treated as an ndarray, the core data
# type in NumPy, so NumPy functions can be applied to a Series directly.
import numpy as np

# Add each value to itself, element by element.
print(np.add(series_custom, series_custom))
# Apply the sine function to each value.
np.sin(series_custom)
# Return the highest value (a single value, not a Series).
np.max(series_custom)
-
Series對應Lambda表示式(求標準差)
#The apply() method in Pandas allows us to specify Python logic #The apply() method requires you to pass in a vectorized operation #that can be applied over each Series object. import numpy as np # returns the data types as a Series types = fandango_films.dtypes #print types # filter data types to just floats, index attributes returns just column names float_columns = types[types.values == 'float64'].index # use bracket notation to filter columns to just float columns float_df = fandango_films[float_columns] #print float_df # `x` is a Series object representing a column deviations = float_df.apply(lambda x: np.std(x)) print(deviations) Metacritic_User 1.505529 IMDB 0.955447 Fandango_Stars 0.538532 Fandango_Ratingvalue 0.501106 RT_norm 1.503265 RT_user_norm 0.997787 Metacritic_norm 0.972522 Metacritic_user_nom 0.752765 IMDB_norm 0.477723 複製程式碼
-
對應兩列通過Lambda表示式求標準差
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']] rt_mt_user.apply(lambda x: np.std(x), axis=1) FILM Avengers: Age of Ultron (2015) 0.375 Cinderella (2015) 0.125 Ant-Man (2015) 0.225 Do You Believe? (2015) 0.925 Hot Tub Time Machine 2 (2015) 0.150 The Water Diviner (2015) 0.150 Irrational Man (2015) 0.575 Top Five (2014) 0.100 Shaun the Sheep Movie (2015) 0.150 Love & Mercy (2015) 0.050 Far From The Madding Crowd (2015) 0.050 Black Sea (2015) 0.150 複製程式碼
3 matplotlib使用實踐
-
折線圖
import pandas as pd unrate = pd.read_csv("C:\\ML\\MLData\\unrate.csv") unrate['DATE'] = pd.to_datetime(unrate['DATE']) print(unrate.head(12)) DATE VALUE 0 1948-01-01 3.4 1 1948-02-01 3.8 2 1948-03-01 4.0 3 1948-04-01 3.9 4 1948-05-01 3.5 5 1948-06-01 3.6 6 1948-07-01 3.6 7 1948-08-01 3.9 8 1948-09-01 3.8 9 1948-10-01 3.7 10 1948-11-01 3.8 11 1948-12-01 4.0 import matplotlib.pyplot as plt plt.plot() plt.show() 複製程式碼
first_twelve = unrate[0:12]
plt.plot(first_twelve['DATE'], first_twelve['VALUE'])
plt.show()
複製程式碼
plt.plot(first_twelve['DATE'], first_twelve['VALUE'])
plt.xticks(rotation=45)
#print help(plt.xticks)
plt.show()
複製程式碼
#xlabel(): accepts a string value, which gets set as the x-axis label.
#ylabel(): accepts a string value, which is set as the y-axis label.
#title(): accepts a string value, which is set as the plot title.
plt.plot(first_twelve['DATE'], first_twelve['VALUE'])
plt.xticks(rotation=90)
plt.xlabel('Month')
plt.ylabel('Unemployment Rate')
plt.title('Monthly Unemployment Trends, 1948')
plt.show()
複製程式碼
-
多條折線圖展示
fig = plt.figure(figsize=(10,6)) colors = ['red', 'blue', 'green', 'orange', 'black'] for i in range(5): start_index = i*12 end_index = (i+1)*12 subset = unrate[start_index:end_index] label = str(1948 + i) plt.plot(subset['MONTH'], subset['VALUE'], c=colors[i], label=label) plt.legend(loc='upper left') plt.xlabel('Month, Integer') plt.ylabel('Unemployment Rate, Percent') plt.title('Monthly Unemployment Trends, 1948-1952') plt.show() 複製程式碼
-
柱狀圖豎型展示
import pandas as pd reviews = pd.read_csv('C:\\ML\\MLData\\fandango_scores.csv') cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars'] norm_reviews = reviews[cols] print(type(reviews)) #列印出第一行 print(norm_reviews[:1]) <class 'pandas.core.frame.DataFrame'> 複製程式碼
import matplotlib.pyplot as plt
from numpy import arange
#取出第一行指定列num_cols的資料
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
bar_heights = norm_reviews.loc[0, num_cols].values
print(bar_heights)
[4.3 3.55 3.9 4.5 5.0]
bar_heights = norm_reviews.loc[0, num_cols].values
#橫軸位置
bar_positions = arange(5) + 0.75
#橫軸標識的位置(1到6之間)
tick_positions = range(1,6)
fig, ax = plt.subplots()
#0.5標識柱狀圖寬度
ax.bar(bar_positions, bar_heights, 0.5)
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_xlabel('Rating Source')
ax.set_ylabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()
複製程式碼
-
柱狀圖橫向表示
import matplotlib.pyplot as plt from numpy import arange num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars'] bar_widths = norm_reviews.loc[0, num_cols].values bar_positions = arange(5) + 0.75 #橫軸標識名的位置(1到6之間) tick_positions = range(1,6) fig, ax = plt.subplots() ax.barh(bar_positions, bar_widths, 0.6) ax.set_yticks(tick_positions) ax.set_yticklabels(num_cols) ax.set_ylabel('Rating Source') ax.set_xlabel('Average Rating') ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)') plt.show() 複製程式碼
-
散點圖
#Switching Axes fig = plt.figure(figsize=(5,10)) ax1 = fig.add_subplot(2,1,1) ax2 = fig.add_subplot(2,1,2) ax1.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm']) ax1.set_xlabel('Fandango') ax1.set_ylabel('Rotten Tomatoes') ax2.scatter(norm_reviews['RT_user_norm'], norm_reviews['Fandango_Ratingvalue']) ax2.set_xlabel('Rotten Tomatoes') ax2.set_ylabel('Fandango') plt.show() 複製程式碼
- Hist的bins區間統計
import pandas as pd
import matplotlib.pyplot as plt
reviews = pd.read_csv('C:\\ML\\MLData\\fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
print(norm_reviews[:5])
#按照列進行分組聚合
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()
fandango_distribution = fandango_distribution.sort_index()
#按照列進行分組聚合
imdb_distribution = norm_reviews['IMDB_norm'].value_counts()
imdb_distribution = imdb_distribution.sort_index()
print(fandango_distribution)
2.7 2
2.8 2
2.9 5
3.0 4
3.1 3
3.2 5
3.3 4
3.4 9
3.5 9
3.6 8
3.7 9
3.8 5
3.9 12
4.0 7
4.1 16
4.2 12
4.3 11
4.4 7
4.5 9
4.6 4
4.8 3
Name: Fandango_Ratingvalue, dtype: int64
print(imdb_distribution)
2.00 1
2.10 1
2.15 1
2.20 1
2.30 2
2.45 2
2.50 1
2.55 1
2.60 2
2.70 4
2.75 5
2.80 2
2.85 1
2.90 1
2.95 3
3.00 2
3.05 4
3.10 1
3.15 9
3.20 6
3.25 4
3.30 9
3.35 7
3.40 1
3.45 7
3.50 4
3.55 7
3.60 10
3.65 5
3.70 8
3.75 6
3.80 3
3.85 4
3.90 9
3.95 2
4.00 1
4.05 1
4.10 4
4.15 1
4.20 2
4.30 1
Name: IMDB_norm, dtype: int64
fig, ax = plt.subplots()
#ax.hist(norm_reviews['Fandango_Ratingvalue'])
#ax.hist(norm_reviews['Fandango_Ratingvalue'],bins=20)
#指定區間(bins)為20個,範圍為4到5
ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(4, 5),bins=20)
plt.show()
複製程式碼
-
四分位數盒圖(箱形圖)
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue'] fig, ax = plt.subplots() #指定統計列取出對應值 ax.boxplot(norm_reviews[num_cols].values) ax.set_xticklabels(num_cols, rotation=90) ax.set_ylim(0,5) plt.show() 複製程式碼
4 Seaborn專業視覺化庫(基於matplotlib)
-
風格設定
import seaborn as sns import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt %matplotlib inline sns.set_style("whitegrid") data = np.random.normal(size=(20, 6)) + np.arange(6) / 2 sns.boxplot(data=data) 複製程式碼
sns.set_style("dark")
sinplot()
sns.set_style("white")
sinplot()
sns.set_style("whitegrid")
sns.boxplot(data=data, palette="deep")
sns.despine(left=True)
複製程式碼
-
調色盤設定
import numpy as np import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline sns.set(rc={"figure.figsize": (6, 6)}) current_palette = sns.color_palette() sns.palplot(current_palette) 複製程式碼
6個預設的顏色迴圈主題: deep, muted, pastel, bright, dark, colorblind
sns.palplot(sns.color_palette("hls", 8))
複製程式碼
data = np.random.normal(size=(20, 8)) + np.arange(8) / 2
sns.boxplot(data=data,palette=sns.color_palette("hls", 8))
data = np.random.normal(size=(20, 8)) + np.arange(8) / 2
#print(data)
sns.boxplot(data=data,palette=sns.color_palette("hls", 8))
複製程式碼
-
區間直方圖繪製(kde引數指定是否繪製核密度估計曲線)
x = np.random.gamma(6, size=200) sns.distplot(x, kde=False, fit=stats.gamma) 複製程式碼
-
線性迴歸1
%matplotlib inline import numpy as np import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt import seaborn as sns sns.set(color_codes=True) np.random.seed(sum(map(ord, "regression"))) tips = sns.load_dataset("tips") tips.head() 複製程式碼
sns.regplot(x="total_bill", y="tip", data=tips)
複製程式碼
-
線性迴歸2
sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips); 複製程式碼
-
多分類問題
%matplotlib inline import numpy as np import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt import seaborn as sns sns.set(style="whitegrid", color_codes=True) np.random.seed(sum(map(ord, "categorical"))) titanic = sns.load_dataset("titanic") tips = sns.load_dataset("tips") iris = sns.load_dataset("iris") sns.stripplot(x="day", y="total_bill", data=tips); 複製程式碼
sns.stripplot(x="day", y="total_bill", data=tips, jitter=True)
複製程式碼
-
樹樁圖(swarmplot)均勻展示
sns.swarmplot(x="day", y="total_bill", data=tips) 複製程式碼
-
樹樁圖(swarmplot)均勻展示,並以hue分類著色
sns.swarmplot(x="day", y="total_bill", hue="sex",data=tips) 複製程式碼
-
盒圖
IQR即統計學概念四分位距,即第一四分位數(Q1)與第三四分位數(Q3)之間的距離;令 N = 1.5 × IQR,如果一個值 > Q3 + N 或 < Q1 - N,則為離群點 #橫槓表示非離群點的最小值和最大值 sns.boxplot(x="day", y="total_bill", hue="time", data=tips); 複製程式碼
-
小提琴圖(越胖包含的資料越多)
sns.violinplot(x="day", y="total_bill", hue="sex", data=tips, split=True); 複製程式碼
-
葫蘆圖
sns.violinplot(x="day", y="total_bill", data=tips, inner=None) sns.swarmplot(x="day", y="total_bill", data=tips, color="w", alpha=.5) 複製程式碼
-
柱狀分類統計圖
sns.barplot(x="sex", y="survived", hue="class", data=titanic); 複製程式碼
-
點圖可以更好的描述變化差異
sns.pointplot(x="sex", y="survived", hue="class", data=titanic); 複製程式碼
sns.pointplot(x="class", y="survived", hue="sex", data=titanic,
palette={"male": "g", "female": "m"},
markers=["^", "o"], linestyles=["-", "--"]);
複製程式碼
-
多層面板分類圖
sns.factorplot(x="day", y="total_bill", hue="smoker", data=tips) 複製程式碼
sns.factorplot(x="day", y="total_bill", hue="smoker", data=tips, kind="bar")
複製程式碼
sns.factorplot(x="day", y="total_bill", hue="smoker",
col="time", data=tips, kind="swarm")
複製程式碼
sns.factorplot(x="time", y="total_bill", hue="smoker",
col="day", data=tips, kind="box", size=4, aspect=.5)
複製程式碼
-
FacetGrid 多引數網格面板
%matplotlib inline import numpy as np import pandas as pd import seaborn as sns from scipy import stats import matplotlib as mpl import matplotlib.pyplot as plt sns.set(style="ticks") np.random.seed(sum(map(ord, "axis_grids"))) tips = sns.load_dataset("tips") tips.head() 複製程式碼
g = sns.FacetGrid(tips, col="time")
g.map(plt.hist, "tip");
複製程式碼
g = sns.FacetGrid(tips, col="sex", hue="smoker")
g.map(plt.scatter, "total_bill", "tip", alpha=.7)
g.add_legend();
複製程式碼
g = sns.FacetGrid(tips, row="smoker", col="time", margin_titles=True)
g.map(sns.regplot, "size", "total_bill", color=".1", fit_reg=False, x_jitter=.1);
複製程式碼
-
熱力圖
%matplotlib inline import matplotlib.pyplot as plt import numpy as np; np.random.seed(0) import seaborn as sns; sns.set() uniform_data = np.random.rand(3, 3) print (uniform_data) heatmap = sns.heatmap(uniform_data) [[ 0.0187898 0.6176355 0.61209572] [ 0.616934 0.94374808 0.6818203 ] [ 0.3595079 0.43703195 0.6976312 ]] 複製程式碼
ax = sns.heatmap(flights, linewidths=.5)
複製程式碼
5 總結
方便複習,整成筆記,內容粗略,勿怪,待完善。
版權宣告:本套技術專欄是作者(秦凱新)平時工作的總結和昇華,通過從真實商業環境抽取案例進行總結和分享,並給出商業應用的調優建議和叢集環境容量規劃等內容,請持續關注本套部落格。QQ郵箱地址:1120746959@qq.com,如有任何學術交流,可隨時聯絡。 秦凱新 於深圳 。