十九、資料整理(6)
作者:Chris Albon
譯者:飛龍
協議:CC BY-NC-SA 4.0
在列中搜尋某個值
# 匯入模組
import pandas as pd
raw_data = {'first_name': ['Jason', 'Jason', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Miller', 'Ali', 'Milner', 'Cooze'],
'age': [42, 42, 36, 24, 73],
'preTestScore': [4, 4, 31, 2, 3],
'postTestScore': [25, 25, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
---|
0 |
Jason |
Miller |
42 |
4 |
25 |
1 |
Jason |
Miller |
42 |
4 |
25 |
2 |
Tina |
Ali |
36 |
31 |
57 |
3 |
Jake |
Milner |
24 |
2 |
62 |
4 |
Amy |
Cooze |
73 |
3 |
70 |
# 在列中尋找值在哪裡
# 檢視 postTestScore 大於 50 的地方
df['preTestScore'].where(df['postTestScore'] > 50)
'''
0 NaN
1 NaN
2 31.0
3 2.0
4 3.0
Name: preTestScore, dtype: float64
'''
選擇包含特定值的行和列
# 匯入模組
import pandas as pd
# 設定 ipython 的最大行顯示
pd.set_option('display.max_row', 1000)
# 設定 ipython 的最大列數顯示
pd.set_option('display.max_columns', 50)
# 建立示例資料幀
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
|
name |
reports |
year |
---|
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
Santa Cruz |
Tina |
31 |
2013 |
Maricopa |
Jake |
2 |
2014 |
Yuma |
Amy |
3 |
2014 |
# 按照列值抓取行
value_list = ['Tina', 'Molly', 'Jason']
df[df.name.isin(value_list)]
|
name |
reports |
year |
---|
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
Santa Cruz |
Tina |
31 |
2013 |
# 獲取列值不是某個值的行
df[~df.name.isin(value_list)]
|
name |
reports |
year |
---|
Maricopa |
Jake |
2 |
2014 |
Yuma |
Amy |
3 |
2014 |
選擇具有特定值的行
import pandas as pd
# 建立示例資料幀
data = {'name': ['Jason', 'Molly'],
'country': [['Syria', 'Lebanon'],['Spain', 'Morocco']]}
df = pd.DataFrame(data)
df
|
country |
name |
---|
0 |
[Syria, Lebanon] |
Jason |
1 |
[Spain, Morocco] |
Molly |
df[df['country'].map(lambda country: 'Syria' in country)]
|
country |
name |
---|
0 |
[Syria, Lebanon] |
Jason |
使用多個過濾器選擇行
import pandas as pd
# 建立示例資料幀
data = {'name': ['A', 'B', 'C', 'D', 'E'],
'score': [1,2,3,4,5]}
df = pd.DataFrame(data)
df
|
name |
score |
---|
0 |
A |
1 |
1 |
B |
2 |
2 |
C |
3 |
3 |
D |
4 |
4 |
E |
5 |
# 選擇資料幀的行,其中 df.score 大於 1 且小於 5
df[(df['score'] > 1) & (df['score'] < 5)]
|
name |
score |
---|
1 |
B |
2 |
2 |
C |
3 |
3 |
D |
4 |
根據條件選擇資料幀的行
# 匯入模組
import pandas as pd
import numpy as np
# 建立資料幀
raw_data = {'first_name': ['Jason', 'Molly', np.nan, np.nan, np.nan],
'nationality': ['USA', 'USA', 'France', 'UK', 'UK'],
'age': [42, 52, 36, 24, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'nationality', 'age'])
df
|
first_name |
nationality |
age |
---|
0 |
Jason |
USA |
42 |
1 |
Molly |
USA |
52 |
2 |
NaN |
France |
36 |
3 |
NaN |
UK |
24 |
4 |
NaN |
UK |
70 |
# 方法 1:使用布林變數
# 如果國籍是美國,則變數為 TRUE
american = df['nationality'] == "USA"
# 如果年齡大於 50,則變數為 TRUE
elderly = df['age'] > 50
# 選擇所有國籍為美國且年齡大於 50 的案例
df[american & elderly]
|
first_name |
nationality |
age |
---|
1 |
Molly |
USA |
52 |
# 方法 2:使用變數屬性
# 選擇所有不缺少名字且國籍為美國的案例
df[df['first_name'].notnull() & (df['nationality'] == "USA")]
|
first_name |
nationality |
age |
---|
0 |
Jason |
USA |
42 |
1 |
Molly |
USA |
52 |
資料幀簡單示例
# 匯入模組
import pandas as pd
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
'age': [42, 52, 36, 24, 73],
'preTestScore': [4, 24, 31, 2, 3],
'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
---|
0 |
Jason |
Miller |
42 |
4 |
25 |
1 |
Molly |
Jacobson |
52 |
24 |
94 |
2 |
Tina |
Ali |
36 |
31 |
57 |
3 |
Jake |
Milner |
24 |
2 |
62 |
4 |
Amy |
Cooze |
73 |
3 |
70 |
# 建立第二個資料幀
raw_data_2 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'],
'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'],
'age': [53, 26, 72, 73, 24],
'preTestScore': [13, 52, 72, 26, 26],
'postTestScore': [82, 52, 56, 234, 254]}
df_2 = pd.DataFrame(raw_data_2, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df_2
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
---|
0 |
Sarah |
Mornig |
53 |
13 |
82 |
1 |
Gueniva |
Jaker |
26 |
52 |
52 |
2 |
Know |
Alom |
72 |
72 |
56 |
3 |
Sara |
Ormon |
73 |
26 |
234 |
4 |
Cat |
Koozer |
24 |
26 |
254 |
# 建立第三個資料幀
raw_data_3 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'],
'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'],
'postTestScore_2': [82, 52, 56, 234, 254]}
df_3 = pd.DataFrame(raw_data_3, columns = ['first_name', 'last_name', 'postTestScore_2'])
df_3
|
first_name |
last_name |
postTestScore_2 |
---|
0 |
Sarah |
Mornig |
82 |
1 |
Gueniva |
Jaker |
52 |
2 |
Know |
Alom |
56 |
3 |
Sara |
Ormon |
234 |
4 |
Cat |
Koozer |
254 |
排序資料幀的行
# 匯入模組
import pandas as pd
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [1, 2, 1, 2, 3],
'coverage': [2, 2, 3, 3, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
|
coverage |
name |
reports |
year |
---|
Cochice |
2 |
Jason |
1 |
2012 |
Pima |
2 |
Molly |
2 |
2012 |
Santa Cruz |
3 |
Tina |
1 |
2013 |
Maricopa |
3 |
Jake |
2 |
2014 |
Yuma |
3 |
Amy |
3 |
2014 |
# 按報告對資料框的行降序排序
# ascending is a boolean flag; False sorts from the largest 'reports' value down.
df.sort_values(by='reports', ascending=False)
|
coverage |
name |
reports |
year |
---|
Yuma |
3 |
Amy |
3 |
2014 |
Pima |
2 |
Molly |
2 |
2012 |
Maricopa |
3 |
Jake |
2 |
2014 |
Cochice |
2 |
Jason |
1 |
2012 |
Santa Cruz |
3 |
Tina |
1 |
2013 |
# 按 coverage 然後是報告對資料幀的行升序排序
df.sort_values(by=['coverage', 'reports'])
|
coverage |
name |
reports |
year |
---|
Cochice |
2 |
Jason |
1 |
2012 |
Pima |
2 |
Molly |
2 |
2012 |
Santa Cruz |
3 |
Tina |
1 |
2013 |
Maricopa |
3 |
Jake |
2 |
2014 |
Yuma |
3 |
Amy |
3 |
2014 |
將經緯度座標變數拆分為單獨的變數
import pandas as pd
import numpy as np
raw_data = {'geo': ['40.0024, -105.4102', '40.0068, -105.266', '39.9318, -105.2813', np.nan]}
df = pd.DataFrame(raw_data, columns = ['geo'])
df
|
geo |
---|
0 |
40.0024, -105.4102 |
1 |
40.0068, -105.266 |
2 |
39.9318, -105.2813 |
3 |
NaN |
--- |
--- |
# 為要放置的迴圈結果建立兩個列表
# Accumulators for the parsed coordinates, one entry per row of df['geo'].
lat = []
lon = []

for row in df['geo']:
    try:
        # Split "lat, lon" on the comma and convert both halves to float;
        # float() tolerates the blank that follows the comma.
        parts = row.split(',')
        lat_value = float(parts[0])
        lon_value = float(parts[1])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: the entry is missing (NaN is a float, it has no .split).
        # IndexError: there is no comma, so there is no second part.
        # ValueError: one of the parts is not numeric text.
        lat_value = lon_value = np.nan
    # Append exactly once per row so both lists stay aligned with the frame.
    lat.append(lat_value)
    lon.append(lon_value)
# 從 lat 和 lon 建立新的兩列
df['latitude'] = lat
df['longitude'] = lon
df
|
geo |
latitude |
longitude |
---|
0 |
40.0024, -105.4102 |
40.0024 |
-105.4102 |
1 |
40.0068, -105.266 |
40.0068 |
-105.266 |
2 |
39.9318, -105.2813 |
39.9318 |
-105.2813 |
3 |
NaN |
NaN |
NaN |
資料流水線
# 建立一些原始資料
raw_data = [1,2,3,4,5,6,7,8,9,10]
# 定義產生 input+6 的生成器
def add_6(numbers):
    """Lazily yield every item of ``numbers`` increased by 6.

    Args:
        numbers: Any iterable whose items support ``+ 6``.

    Yields:
        ``x + 6`` for each ``x`` in ``numbers``, in input order.
    """
    for x in numbers:
        yield x + 6
# 定義產生 input-2 的生成器
def subtract_2(numbers):
    """Lazily yield every item of ``numbers`` decreased by 2.

    Args:
        numbers: Any iterable whose items support ``- 2``.

    Yields:
        ``x - 2`` for each ``x`` in ``numbers``, in input order.
    """
    for x in numbers:
        yield x - 2
# 定義產生 input*100 的生成器
def multiply_by_100(numbers):
    """Lazily yield every item of ``numbers`` multiplied by 100.

    Args:
        numbers: Any iterable whose items support ``* 100``.

    Yields:
        ``x * 100`` for each ``x`` in ``numbers``, in input order.
    """
    for x in numbers:
        yield x * 100
# 流水線的第一步
step1 = add_6(raw_data)
# 流水線的第二步
step2 = subtract_2(step1)
# 流水線的第三步
pipeline = multiply_by_100(step2)
# 原始資料的第一個元素
next(pipeline)
# 500
# 原始資料的第二個元素
next(pipeline)
# 600
# 處理所有資料
# Use a dedicated loop name: iterating as `raw_data` would overwrite the
# original input list with the last value the pipeline produces.
for result in pipeline:
    print(result)
'''
700
800
900
1000
1100
1200
1300
1400
'''
資料幀中的字串整理
# 匯入模組
import pandas as pd
import numpy as np
import re as re
# Sample records; the third person has no e-mail address, marked with np.nan
# (the upper-case np.NAN alias no longer exists in NumPy 2.0).
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'email': ['jas203@gmail.com', 'momomolly@gmail.com', np.nan,
              'battler@milner.com', 'Ames1234@yahoo.com'],
    'preTestScore': [4, 24, 31, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70],
}
# Pass `columns` explicitly so the column order is fixed.
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'email', 'preTestScore', 'postTestScore'])
df
# 電子郵件列中的哪些字串包含 'gmail'
df['email'].str.contains('gmail')
'''
0 True
1 True
2 NaN
3 False
4 False
Name: email, dtype: object
'''
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
df['email'].str.findall(pattern, flags=re.IGNORECASE)
'''
0 [(jas203, gmail, com)]
1 [(momomolly, gmail, com)]
2 NaN
3 [(battler, milner, com)]
4 [(Ames1234, yahoo, com)]
Name: email, dtype: object
'''
# Series.str.match only reports True/False per row in current pandas, so the
# captured groups are taken from the first findall hit instead. Each entry is
# a (user, domain, tld) tuple, or NaN where there is no address.
matches = df['email'].str.findall(pattern, flags=re.IGNORECASE).str[0]
matches
'''
/Users/chrisralbon/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: In future versions of pandas, match will change to always return a bool indexer.
if __name__ == '__main__':
0 (jas203, gmail, com)
1 (momomolly, gmail, com)
2 NaN
3 (battler, milner, com)
4 (Ames1234, yahoo, com)
Name: email, dtype: object
'''
matches.str[1]
'''
0 gmail
1 gmail
2 NaN
3 milner
4 yahoo
Name: email, dtype: object
'''
和 Pandas 一起使用列表推導式
# 匯入模組
import pandas as pd
# 設定 ipython 的最大行顯示
pd.set_option('display.max_row', 1000)
# 設定 ipython 的最大列數顯示
pd.set_option('display.max_columns', 50)
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
|
name |
reports |
year |
---|
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
Santa Cruz |
Tina |
31 |
2013 |
Maricopa |
Jake |
2 |
2014 |
Yuma |
Amy |
3 |
2014 |
作為迴圈的列表推導式。
# 建立變數
next_year = []
# 對於 df.year 的每一行
for row in df['year']:
# 為這一行新增 1 並將其附加到 next_year
next_year.append(row + 1)
# 建立 df.next_year
df['next_year'] = next_year
# 檢視資料幀
df
|
name |
reports |
year |
next_year |
---|
Cochice |
Jason |
4 |
2012 |
2013 |
Pima |
Molly |
24 |
2012 |
2013 |
Santa Cruz |
Tina |
31 |
2013 |
2014 |
Maricopa |
Jake |
2 |
2014 |
2015 |
Yuma |
Amy |
3 |
2014 |
2015 |
作為列表推導式。
# 對於 df.year 中的每一行,從行中減去 1
df['previous_year'] = [row-1 for row in df['year']]
df
|
name |
reports |
year |
next_year |
previous_year |
---|
Cochice |
Jason |
4 |
2012 |
2013 |
2011 |
Pima |
Molly |
24 |
2012 |
2013 |
2011 |
Santa Cruz |
Tina |
31 |
2013 |
2014 |
2012 |
Maricopa |
Jake |
2 |
2014 |
2015 |
2013 |
Yuma |
Amy |
3 |
2014 |
2015 |
2013 |
使用 Seaborn 來視覺化資料幀
import pandas as pd
%matplotlib inline
import random
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.DataFrame()
df['x'] = random.sample(range(1, 100), 25)
df['y'] = random.sample(range(1, 100), 25)
df.head()
|
x |
y |
---|
0 |
18 |
25 |
1 |
42 |
67 |
2 |
52 |
77 |
3 |
4 |
34 |
4 |
14 |
69 |
# 散點圖
# Name the x/y columns by keyword: seaborn 0.12+ accepts only `data` positionally.
sns.lmplot(x='x', y='y', data=df, fit_reg=False)
# <seaborn.axisgrid.FacetGrid at 0x114563b00>
# 密度圖
sns.kdeplot(df.y)
# <matplotlib.axes._subplots.AxesSubplot at 0x113ea2ef0>
# Bivariate density with df.y on the horizontal axis and df.x on the vertical
# axis; the two variables must be passed by keyword in seaborn 0.12+.
sns.kdeplot(x=df.y, y=df.x)
# <matplotlib.axes._subplots.AxesSubplot at 0x113d7fef0>
sns.distplot(df.x)
# <matplotlib.axes._subplots.AxesSubplot at 0x114294160>
# 直方圖
plt.hist(df.x, alpha=.3)
sns.rugplot(df.x);
# 箱形圖
sns.boxplot([df.y, df.x])
# <matplotlib.axes._subplots.AxesSubplot at 0x1142b8b38>
# 提琴圖
sns.violinplot([df.y, df.x])
# <matplotlib.axes._subplots.AxesSubplot at 0x114444a58>
# 熱力圖
sns.heatmap([df.y, df.x], annot=True, fmt="d")
# <matplotlib.axes._subplots.AxesSubplot at 0x114530c88>
# 聚類圖
sns.clustermap(df)
# <seaborn.matrix.ClusterGrid at 0x116f313c8>
Pandas 資料結構
# 匯入模組
import pandas as pd
序列 101
序列是一維陣列(類似 R 的向量)。
# 建立 floodingReports 數量的序列
floodingReports = pd.Series([5, 6, 2, 9, 12])
floodingReports
'''
0 5
1 6
2 2
3 9
4 12
dtype: int64
'''
請注意,第一列數字(0 到 4)是索引。
# 將縣名設定為 floodingReports 序列的索引
floodingReports = pd.Series([5, 6, 2, 9, 12], index=['Cochise County', 'Pima County', 'Santa Cruz County', 'Maricopa County', 'Yuma County'])
floodingReports
'''
Cochise County 5
Pima County 6
Santa Cruz County 2
Maricopa County 9
Yuma County 12
dtype: int64
'''
floodingReports['Cochise County']
# 5
floodingReports[floodingReports > 6]
'''
Maricopa County 9
Yuma County 12
dtype: int64
'''
從字典中建立 Pandas 序列。
注意:執行此操作時,字典的鍵將成為序列索引。
# 建立字典
fireReports_dict = {'Cochise County': 12, 'Pima County': 342, 'Santa Cruz County': 13, 'Maricopa County': 42, 'Yuma County' : 52}
# 將字典轉換為 pd.Series,然後檢視它
fireReports = pd.Series(fireReports_dict); fireReports
'''
Cochise County 12
Pima County 342
Santa Cruz County 13
Maricopa County 42
Yuma County 52
dtype: int64
'''
fireReports.index = ["Cochice", "Pima", "Santa Cruz", "Maricopa", "Yuma"]
fireReports
'''
Cochice 12
Pima 342
Santa Cruz 13
Maricopa 42
Yuma 52
dtype: int64
'''
資料幀 101
資料幀就像 R 的資料幀。
# 從等長列表或 NumPy 陣列的字典中建立資料幀
data = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data)
df
|
county |
reports |
year |
---|
0 |
Cochice |
4 |
2012 |
1 |
Pima |
24 |
2012 |
2 |
Santa Cruz |
31 |
2013 |
3 |
Maricopa |
2 |
2014 |
4 |
Yuma |
3 |
2014 |
# 使用 columns 屬性設定列的順序
dfColumnOrdered = pd.DataFrame(data, columns=['county', 'year', 'reports'])
dfColumnOrdered
|
county |
year |
reports |
---|
0 |
Cochice |
2012 |
4 |
1 |
Pima |
2012 |
24 |
2 |
Santa Cruz |
2013 |
31 |
3 |
Maricopa |
2014 |
2 |
4 |
Yuma |
2014 |
3 |
# 新增一列
dfColumnOrdered['newsCoverage'] = pd.Series([42.3, 92.1, 12.2, 39.3, 30.2])
dfColumnOrdered
|
county |
year |
reports |
newsCoverage |
---|
0 |
Cochice |
2012 |
4 |
42.3 |
1 |
Pima |
2012 |
24 |
92.1 |
2 |
Santa Cruz |
2013 |
31 |
12.2 |
3 |
Maricopa |
2014 |
2 |
39.3 |
4 |
Yuma |
2014 |
3 |
30.2 |
# 刪除一列
del dfColumnOrdered['newsCoverage']
dfColumnOrdered
|
county |
year |
reports |
---|
0 |
Cochice |
2012 |
4 |
1 |
Pima |
2012 |
24 |
2 |
Santa Cruz |
2013 |
31 |
3 |
Maricopa |
2014 |
2 |
4 |
Yuma |
2014 |
3 |
# 轉置資料幀
dfColumnOrdered.T
|
0 |
1 |
2 |
3 |
4 |
---|
county |
Cochice |
Pima |
Santa Cruz |
Maricopa |
Yuma |
year |
2012 |
2012 |
2013 |
2014 |
2014 |
reports |
4 |
24 |
31 |
2 |
3 |
Pandas 時間序列基礎
# 匯入模組
from datetime import datetime
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as pyplot
data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'],
'battle_deaths': [34, 25, 26, 15, 15, 14, 26, 25, 62, 41]}
df = pd.DataFrame(data, columns = ['date', 'battle_deaths'])
print(df)
'''
date battle_deaths
0 2014-05-01 18:47:05.069722 34
1 2014-05-01 18:47:05.119994 25
2 2014-05-02 18:47:05.178768 26
3 2014-05-02 18:47:05.230071 15
4 2014-05-02 18:47:05.230071 15
5 2014-05-02 18:47:05.280592 14
6 2014-05-03 18:47:05.332662 26
7 2014-05-03 18:47:05.385109 25
8 2014-05-04 18:47:05.436523 62
9 2014-05-04 18:47:05.486877 41
'''
df['date'] = pd.to_datetime(df['date'])
df.index = df['date']
del df['date']
df
|
battle_deaths |
---|
date |
|
2014-05-01 18:47:05.069722 |
34 |
2014-05-01 18:47:05.119994 |
25 |
2014-05-02 18:47:05.178768 |
26 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.280592 |
14 |
2014-05-03 18:47:05.332662 |
26 |
2014-05-03 18:47:05.385109 |
25 |
2014-05-04 18:47:05.436523 |
62 |
2014-05-04 18:47:05.486877 |
41 |
# 檢視 2014 年的所有觀測
# Select rows by partial date string through .loc; in pandas 2.x plain
# df['2014'] is treated as a column lookup and raises KeyError.
df.loc['2014']
|
battle_deaths |
---|
date |
|
2014-05-01 18:47:05.069722 |
34 |
2014-05-01 18:47:05.119994 |
25 |
2014-05-02 18:47:05.178768 |
26 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.280592 |
14 |
2014-05-03 18:47:05.332662 |
26 |
2014-05-03 18:47:05.385109 |
25 |
2014-05-04 18:47:05.436523 |
62 |
2014-05-04 18:47:05.486877 |
41 |
# 檢視 2014 年 5 月的所有觀測
# Select rows by partial date string through .loc; in pandas 2.x plain
# df['2014-05'] is treated as a column lookup and raises KeyError.
df.loc['2014-05']
|
battle_deaths |
---|
date |
|
2014-05-01 18:47:05.069722 |
34 |
2014-05-01 18:47:05.119994 |
25 |
2014-05-02 18:47:05.178768 |
26 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.280592 |
14 |
2014-05-03 18:47:05.332662 |
26 |
2014-05-03 18:47:05.385109 |
25 |
2014-05-04 18:47:05.436523 |
62 |
2014-05-04 18:47:05.486877 |
41 |
# 檢視 2014.5.3 及之後的所有觀測
df[datetime(2014, 5, 3):]
|
battle_deaths |
---|
date |
|
2014-05-03 18:47:05.332662 |
26 |
2014-05-03 18:47:05.385109 |
25 |
2014-05-04 18:47:05.436523 |
62 |
2014-05-04 18:47:05.486877 |
41 |
5 月 3 日至 5 月 4 日之間的觀測
# 檢視 2014.5.3~4 的所有觀測
df['5/3/2014':'5/4/2014']
|
battle_deaths |
---|
date |
|
2014-05-03 18:47:05.332662 |
26 |
2014-05-03 18:47:05.385109 |
25 |
2014-05-04 18:47:05.436523 |
62 |
2014-05-04 18:47:05.486877 |
41 |
# 截斷 2014.5.2 之後的觀測
df.truncate(after='5/3/2014')
|
battle_deaths |
---|
date |
|
2014-05-01 18:47:05.069722 |
34 |
2014-05-01 18:47:05.119994 |
25 |
2014-05-02 18:47:05.178768 |
26 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.280592 |
14 |
# 2014.5 的觀測
# Select rows by partial date string through .loc; in pandas 2.x plain
# df['5-2014'] is treated as a column lookup and raises KeyError.
df.loc['5-2014']
|
battle_deaths |
---|
date |
|
2014-05-01 18:47:05.069722 |
34 |
2014-05-01 18:47:05.119994 |
25 |
2014-05-02 18:47:05.178768 |
26 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.230071 |
15 |
2014-05-02 18:47:05.280592 |
14 |
2014-05-03 18:47:05.332662 |
26 |
2014-05-03 18:47:05.385109 |
25 |
2014-05-04 18:47:05.436523 |
62 |
2014-05-04 18:47:05.486877 |
41 |
# 計算每個時間戳的觀測數
df.groupby(level=0).count()
|
battle_deaths |
---|
date |
|
2014-05-01 18:47:05.069722 |
1 |
2014-05-01 18:47:05.119994 |
1 |
2014-05-02 18:47:05.178768 |
1 |
2014-05-02 18:47:05.230071 |
2 |
2014-05-02 18:47:05.280592 |
1 |
2014-05-03 18:47:05.332662 |
1 |
2014-05-03 18:47:05.385109 |
1 |
2014-05-04 18:47:05.436523 |
1 |
2014-05-04 18:47:05.486877 |
1 |
# 每天的 battle_deaths 均值
df.resample('D').mean()
|
battle_deaths |
---|
date |
|
2014-05-01 |
29.5 |
2014-05-02 |
17.5 |
2014-05-03 |
25.5 |
2014-05-04 |
51.5 |
# 每天的 battle_deaths 總數
df.resample('D').sum()
|
battle_deaths |
---|
date |
|
2014-05-01 |
59 |
2014-05-02 |
70 |
2014-05-03 |
51 |
2014-05-04 |
103 |
# 繪製每天的總死亡人數
df.resample('D').sum().plot()
# <matplotlib.axes._subplots.AxesSubplot at 0x11187a940>