資料科學和人工智慧技術筆記 十九、資料整理(6)

weixin_34378969發表於2019-01-01

十九、資料整理(6)

作者:Chris Albon

譯者:飛龍

協議:CC BY-NC-SA 4.0

在列中搜尋某個值

# 匯入模組
import pandas as pd

raw_data = {'first_name': ['Jason', 'Jason', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Miller', 'Ali', 'Milner', 'Cooze'], 
        'age': [42, 42, 36, 24, 73], 
        'preTestScore': [4, 4, 31, 2, 3],
        'postTestScore': [25, 25, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
first_name last_name age preTestScore postTestScore
0 Jason Miller 42 4 25
1 Jason Miller 42 4 25
2 Tina Ali 36 31 57
3 Jake Milner 24 2 62
4 Amy Cooze 73 3 70
# 在列中尋找值在哪裡
# 檢視 postTestscore 大於 50 的地方
df['preTestScore'].where(df['postTestScore'] > 50)

'''
0     NaN
1     NaN
2    31.0
3     2.0
4     3.0
Name: preTestScore, dtype: float64 
'''

選擇包含特定值的行和列

# 匯入模組
import pandas as pd

# Maximum number of rows pandas prints before truncating the display.
# The full option name is spelled out instead of relying on pandas'
# pattern matching of abbreviated option names.
pd.set_option('display.max_rows', 1000)

# Maximum number of columns pandas prints before truncating the display
pd.set_option('display.max_columns', 50)

# 建立示例資料幀
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'year': [2012, 2012, 2013, 2014, 2014], 
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
name reports year
Cochice Jason 4 2012
Pima Molly 24 2012
Santa Cruz Tina 31 2013
Maricopa Jake 2 2014
Yuma Amy 3 2014
# 按照列值抓取行
value_list = ['Tina', 'Molly', 'Jason']

df[df.name.isin(value_list)]
name reports year
Cochice Jason 4 2012
Pima Molly 24 2012
Santa Cruz Tina 31 2013
# 獲取列值不是某個值的行
df[~df.name.isin(value_list)]
name reports year
Maricopa Jake 2 2014
Yuma Amy 3 2014

選擇具有特定值的行

import pandas as pd

# 建立示例資料幀
data = {'name': ['Jason', 'Molly'], 
        'country': [['Syria', 'Lebanon'],['Spain', 'Morocco']]}
df = pd.DataFrame(data)
df
country name
0 [Syria, Lebanon] Jason
1 [Spain, Morocco] Molly
df[df['country'].map(lambda country: 'Syria' in country)]
country name
0 [Syria, Lebanon] Jason

使用多個過濾器選擇行

import pandas as pd

# 建立示例資料幀
data = {'name': ['A', 'B', 'C', 'D', 'E'], 
        'score': [1,2,3,4,5]}
df = pd.DataFrame(data)
df
name score
0 A 1
1 B 2
2 C 3
3 D 4
4 E 5
# 選擇資料幀的行,其中 df.score 大於 1 且小於 5
df[(df['score'] > 1) & (df['score'] < 5)]
name score
1 B 2
2 C 3
3 D 4

根據條件選擇資料幀的行

# 匯入模組
import pandas as pd
import numpy as np

# 建立資料幀
raw_data = {'first_name': ['Jason', 'Molly', np.nan, np.nan, np.nan], 
        'nationality': ['USA', 'USA', 'France', 'UK', 'UK'], 
        'age': [42, 52, 36, 24, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'nationality', 'age'])
df
first_name nationality age
0 Jason USA 42
1 Molly USA 52
2 NaN France 36
3 NaN UK 24
4 NaN UK 70
# 方法 1:使用布林變數
# 如果國籍是美國,則變數為 TRUE
american = df['nationality'] == "USA"

# 如果年齡大於 50,則變數為 TRUE
elderly = df['age'] > 50

# 選擇所有國籍為美國且年齡大於 50 的案例
df[american & elderly]
first_name nationality age
1 Molly USA 52
# 方法 2:使用變數屬性
# 選擇所有不缺少名字且國籍為美國的案例
df[df['first_name'].notnull() & (df['nationality'] == "USA")]
first_name nationality age
0 Jason USA 42
1 Molly USA 52

資料幀簡單示例

# 匯入模組
import pandas as pd

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
        'age': [42, 52, 36, 24, 73], 
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
first_name last_name age preTestScore postTestScore
0 Jason Miller 42 4 25
1 Molly Jacobson 52 24 94
2 Tina Ali 36 31 57
3 Jake Milner 24 2 62
4 Amy Cooze 73 3 70
# 建立第二個資料幀
raw_data_2 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'], 
        'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'], 
        'age': [53, 26, 72, 73, 24], 
        'preTestScore': [13, 52, 72, 26, 26],
        'postTestScore': [82, 52, 56, 234, 254]}
df_2 = pd.DataFrame(raw_data_2, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df_2
first_name last_name age preTestScore postTestScore
0 Sarah Mornig 53 13 82
1 Gueniva Jaker 26 52 52
2 Know Alom 72 72 56
3 Sara Ormon 73 26 234
4 Cat Koozer 24 26 254
# 建立第三個資料幀
raw_data_3 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'], 
        'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'],
         'postTestScore_2': [82, 52, 56, 234, 254]}
df_3 = pd.DataFrame(raw_data_3, columns = ['first_name', 'last_name', 'postTestScore_2'])
df_3
first_name last_name postTestScore_2
0 Sarah Mornig 82
1 Gueniva Jaker 52
2 Know Alom 56
3 Sara Ormon 234
4 Cat Koozer 254

排序資料幀的行

# 匯入模組
import pandas as pd

data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'year': [2012, 2012, 2013, 2014, 2014], 
        'reports': [1, 2, 1, 2, 3],
        'coverage': [2, 2, 3, 3, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
coverage name reports year
Cochice 2 Jason 1 2012
Pima 2 Molly 2 2012
Santa Cruz 3 Tina 1 2013
Maricopa 3 Jake 2 2014
Yuma 3 Amy 3 2014
# Sort the rows by 'reports' in descending order
# (a real boolean states the intent; 0 only worked because it is falsy)
df.sort_values(by='reports', ascending=False)
coverage name reports year
Yuma 3 Amy 3 2014
Pima 2 Molly 2 2012
Maricopa 3 Jake 2 2014
Cochice 2 Jason 1 2012
Santa Cruz 3 Tina 1 2013
# 按 coverage 然後是報告對資料幀的行升序排序
df.sort_values(by=['coverage', 'reports'])
coverage name reports year
Cochice 2 Jason 1 2012
Pima 2 Molly 2 2012
Santa Cruz 3 Tina 1 2013
Maricopa 3 Jake 2 2014
Yuma 3 Amy 3 2014

將經緯度座標變數拆分為單獨的變數

import pandas as pd
import numpy as np

raw_data = {'geo': ['40.0024, -105.4102', '40.0068, -105.266', '39.9318, -105.2813', np.nan]}
df = pd.DataFrame(raw_data, columns = ['geo'])
df
geo
0 40.0024, -105.4102
1 40.0068, -105.266
2 39.9318, -105.2813
3 NaN
--- ---
# Collect the parsed coordinates, one entry per row of df['geo']
lat = []
lon = []

for row in df['geo']:
    try:
        # Split once so both halves come from the same parse, and strip the
        # space that follows the comma. Values are kept as strings.
        parts = row.split(',')
        latitude, longitude = parts[0].strip(), parts[1].strip()
    except (AttributeError, IndexError):
        # AttributeError: missing cell (NaN is a float and has no .split)
        # IndexError: string without a comma
        # Either way record exactly one missing value per list so that lat
        # and lon stay the same length as the data frame.
        latitude = longitude = np.nan
    lat.append(latitude)
    lon.append(longitude)

# Create the two new columns from lat and lon
df['latitude'] = lat
df['longitude'] = lon

df
geo latitude longitude
0 40.0024, -105.4102 40.0024 -105.4102
1 40.0068, -105.266 40.0068 -105.266
2 39.9318, -105.2813 39.9318 -105.2813
3 NaN NaN NaN

資料流水線

# 建立一些原始資料
raw_data = [1,2,3,4,5,6,7,8,9,10]

# 定義產生 input+6 的生成器
def add_6(numbers):
    """Lazily yield every item of ``numbers`` increased by 6."""
    yield from (value + 6 for value in numbers)

# 定義產生 input-2 的生成器
def subtract_2(numbers):
    """Lazily yield every item of ``numbers`` decreased by 2."""
    for item in numbers:
        yield item - 2

# 定義產生 input*100 的生成器
def multiply_by_100(numbers):
    """Lazily yield every item of ``numbers`` multiplied by 100."""
    yield from map(lambda item: item * 100, numbers)

# 流水線的第一步
step1 = add_6(raw_data)

# 流水線的第二步
step2 = subtract_2(step1)

# 流水線的第三步
pipeline = multiply_by_100(step2)

# 原始資料的第一個元素
next(pipeline)

# 500 

# 原始資料的第二個元素
next(pipeline)

# 600 

# Process the remaining data. A dedicated loop variable is used so the
# raw_data list is not overwritten by the last value that is printed.
for value in pipeline:
    print(value)

'''
700
800
900
1000
1100
1200
1300
1400
'''

資料幀中的字串整理

# 匯入模組
import pandas as pd
import numpy as np
import re as re

# Example records; the third person's e-mail address is deliberately missing.
# The addresses are plain 'user@domain.tld' strings so the regular-expression
# examples that follow have something to match.
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
        'email': ['jas203@gmail.com', 'momomolly@gmail.com', np.nan, 'battler@milner.com', 'Ames1234@yahoo.com'], 
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'email', 'preTestScore', 'postTestScore'])
df
first_name last_name email preTestScore postTestScore
0 Jason Miller jas203@gmail.com 4 25
1 Molly Jacobson momomolly@gmail.com 24 94
2 Tina Ali NaN 31 57
3 Jake Milner battler@milner.com 2 62
4 Amy Cooze Ames1234@yahoo.com 3 70
# 電子郵件列中的哪些字串包含 'gmail'
df['email'].str.contains('gmail')

'''
0     True
1     True
2      NaN
3    False
4    False
Name: email, dtype: object 
'''

# Three capture groups: user name, domain, top-level domain
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

df['email'].str.findall(pattern, flags=re.IGNORECASE)

'''
0       [(jas203, gmail, com)]
1    [(momomolly, gmail, com)]
2                          NaN
3     [(battler, milner, com)]
4     [(Ames1234, yahoo, com)]
Name: email, dtype: object 
'''

# Pull the capture groups out into columns 0 (user), 1 (domain) and 2 (TLD).
# str.extract is used because str.match now only reports True/False for
# whether the pattern matched and no longer returns the captured groups.
matches = df['email'].str.extract(pattern, flags=re.IGNORECASE)
matches

'''
           0       1    2
0     jas203   gmail  com
1  momomolly   gmail  com
2        NaN     NaN  NaN
3    battler  milner  com
4   Ames1234   yahoo  com
'''

# Column 1 holds the second capture group, i.e. the domain
matches[1]

'''
0     gmail
1     gmail
2       NaN
3    milner
4     yahoo
Name: 1, dtype: object
'''

和 Pandas 一起使用列表推導式

# 匯入模組
import pandas as pd

# Maximum number of rows pandas prints before truncating the display.
# The full option name is spelled out instead of relying on pandas'
# pattern matching of abbreviated option names.
pd.set_option('display.max_rows', 1000)

# Maximum number of columns pandas prints before truncating the display
pd.set_option('display.max_columns', 50)

data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'year': [2012, 2012, 2013, 2014, 2014], 
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
name reports year
Cochice Jason 4 2012
Pima Molly 24 2012
Santa Cruz Tina 31 2013
Maricopa Jake 2 2014
Yuma Amy 3 2014

使用 for 迴圈的寫法。

# Start with an empty list that collects the results
next_year = []

# For each value in df['year']
for row in df['year']:
    # Add 1 to the value and append it to next_year
    next_year.append(row + 1)

# Store the list as the new column df['next_year']
df['next_year'] = next_year

# 檢視資料幀
df
name reports year next_year
Cochice Jason 4 2012 2013
Pima Molly 24 2012 2013
Santa Cruz Tina 31 2013 2014
Maricopa Jake 2 2014 2015
Yuma Amy 3 2014 2015

作為列表推導式。

# For each value in df['year'], subtract 1; the resulting list becomes
# the new column df['previous_year']
df['previous_year'] = [row-1 for row in df['year']]

df
name reports year next_year previous_year
Cochice Jason 4 2012 2013 2011
Pima Molly 24 2012 2013 2011
Santa Cruz Tina 31 2013 2014 2012
Maricopa Jake 2 2014 2015 2013
Yuma Amy 3 2014 2015 2013

使用 Seaborn 來視覺化資料幀

import pandas as pd
%matplotlib inline
import random
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.DataFrame()

df['x'] = random.sample(range(1, 100), 25)
df['y'] = random.sample(range(1, 100), 25)

df.head()
x y
0 18 25
1 42 67
2 52 77
3 4 34
4 14 69
# Scatter plot (fit_reg=False leaves out the regression line).
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
sns.lmplot(x='x', y='y', data=df, fit_reg=False)

# <seaborn.axisgrid.FacetGrid at 0x114563b00> 
118142-dc8ffcd715980106.png
png
# 密度圖
sns.kdeplot(df.y)

# <matplotlib.axes._subplots.AxesSubplot at 0x113ea2ef0> 
118142-a1d90a3d10ed2185.png
png
# Bivariate density plot: df.y on the horizontal axis, df.x on the vertical
# axis. The second vector must be passed by keyword in recent seaborn releases.
sns.kdeplot(x=df.y, y=df.x)

# <matplotlib.axes._subplots.AxesSubplot at 0x113d7fef0> 
118142-005505f4483c2add.png
png
sns.distplot(df.x)

# <matplotlib.axes._subplots.AxesSubplot at 0x114294160> 
118142-f98c49b9c3672710.png
png
# 直方圖
plt.hist(df.x, alpha=.3)
sns.rugplot(df.x);
118142-0cd344a5709d657b.png
png
# 箱形圖
sns.boxplot([df.y, df.x])

# <matplotlib.axes._subplots.AxesSubplot at 0x1142b8b38> 
118142-c86f515b9cced862.png
png
# 提琴圖
sns.violinplot([df.y, df.x])

# <matplotlib.axes._subplots.AxesSubplot at 0x114444a58> 
118142-ecbdf2b008ed235b.png
png
# 熱力圖
sns.heatmap([df.y, df.x], annot=True, fmt="d")

# <matplotlib.axes._subplots.AxesSubplot at 0x114530c88> 
118142-cdf2256694892ef5.png
png
# 聚類圖
sns.clustermap(df)

# <seaborn.matrix.ClusterGrid at 0x116f313c8> 
118142-9fe068a06d08d493.png
png

Pandas 資料結構

# 匯入模組
import pandas as pd

序列 101

序列是一維陣列(類似 R 的向量)。

# 建立 floodingReports 數量的序列
floodingReports = pd.Series([5, 6, 2, 9, 12])
floodingReports

'''
0     5
1     6
2     2
3     9
4    12
dtype: int64 
'''

請注意,第一列數字(0 到 4)是索引。

# 將縣名設定為 floodingReports 序列的索引
floodingReports = pd.Series([5, 6, 2, 9, 12], index=['Cochise County', 'Pima County', 'Santa Cruz County', 'Maricopa County', 'Yuma County'])
floodingReports

'''
Cochise County        5
Pima County           6
Santa Cruz County     2
Maricopa County       9
Yuma County          12
dtype: int64 
'''

floodingReports['Cochise County']

# 5 

floodingReports[floodingReports > 6]

'''
Maricopa County     9
Yuma County        12
dtype: int64 
'''

從字典中建立 Pandas 序列。

注意:執行此操作時,字典的鍵將成為序列索引。

# Build a dictionary of fire-report counts keyed by county name
fireReports_dict = {'Cochise County': 12, 'Pima County': 342, 'Santa Cruz County': 13, 'Maricopa County': 42, 'Yuma County' : 52}

# Convert the dictionary to a pd.Series, then view it
fireReports = pd.Series(fireReports_dict)
fireReports

'''
Cochise County        12
Maricopa County       42
Pima County          342
Santa Cruz County     13
Yuma County           52
dtype: int64 
'''

# Shorten the labels through an explicit old-label -> new-label mapping.
# Assigning a hand-written list to .index relabels purely by position, which
# attaches counts to the wrong county whenever the Series order differs from
# the order of that list.
short_names = {'Cochise County': 'Cochice', 'Pima County': 'Pima', 'Santa Cruz County': 'Santa Cruz', 'Maricopa County': 'Maricopa', 'Yuma County': 'Yuma'}
fireReports.index = [short_names[county] for county in fireReports.index]
fireReports

'''
Cochice        12
Maricopa       42
Pima          342
Santa Cruz     13
Yuma           52
dtype: int64 
'''

資料幀 101

資料幀就像 R 的資料幀。

# 從等長列表或 NumPy 陣列的字典中建立資料幀
data = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'], 
        'year': [2012, 2012, 2013, 2014, 2014], 
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data)
df
county reports year
0 Cochice 4 2012
1 Pima 24 2012
2 Santa Cruz 31 2013
3 Maricopa 2 2014
4 Yuma 3 2014
# 使用 columns 屬性設定列的順序
dfColumnOrdered = pd.DataFrame(data, columns=['county', 'year', 'reports'])
dfColumnOrdered
county year reports
0 Cochice 2012 4
1 Pima 2012 24
2 Santa Cruz 2013 31
3 Maricopa 2014 2
4 Yuma 2014 3
# 新增一列
dfColumnOrdered['newsCoverage'] = pd.Series([42.3, 92.1, 12.2, 39.3, 30.2])
dfColumnOrdered
county year reports newsCoverage
0 Cochice 2012 4 42.3
1 Pima 2012 24 92.1
2 Santa Cruz 2013 31 12.2
3 Maricopa 2014 2 39.3
4 Yuma 2014 3 30.2
# 刪除一列
del dfColumnOrdered['newsCoverage']
dfColumnOrdered
county year reports
0 Cochice 2012 4
1 Pima 2012 24
2 Santa Cruz 2013 31
3 Maricopa 2014 2
4 Yuma 2014 3
# 轉置資料幀
dfColumnOrdered.T
0 1 2 3 4
county Cochice Pima Santa Cruz Maricopa Yuma
year 2012 2012 2013 2014 2014
reports 4 24 31 2 3

Pandas 時間序列基礎

# 匯入模組
from datetime import datetime
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as pyplot

data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'], 
        'battle_deaths': [34, 25, 26, 15, 15, 14, 26, 25, 62, 41]}
df = pd.DataFrame(data, columns = ['date', 'battle_deaths'])
print(df)

'''
 date  battle_deaths
0  2014-05-01 18:47:05.069722             34
1  2014-05-01 18:47:05.119994             25
2  2014-05-02 18:47:05.178768             26
3  2014-05-02 18:47:05.230071             15
4  2014-05-02 18:47:05.230071             15
5  2014-05-02 18:47:05.280592             14
6  2014-05-03 18:47:05.332662             26
7  2014-05-03 18:47:05.385109             25
8  2014-05-04 18:47:05.436523             62
9  2014-05-04 18:47:05.486877             41 
'''

df['date'] = pd.to_datetime(df['date'])

df.index = df['date']
del df['date']
df
battle_deaths
date
2014-05-01 18:47:05.069722 34
2014-05-01 18:47:05.119994 25
2014-05-02 18:47:05.178768 26
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.280592 14
2014-05-03 18:47:05.332662 26
2014-05-03 18:47:05.385109 25
2014-05-04 18:47:05.436523 62
2014-05-04 18:47:05.486877 41
# View all observations from 2014.
# Row selection by partial date string goes through .loc; plain df['2014']
# is treated as a column lookup by current pandas and raises KeyError.
df.loc['2014']
battle_deaths
date
2014-05-01 18:47:05.069722 34
2014-05-01 18:47:05.119994 25
2014-05-02 18:47:05.178768 26
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.280592 14
2014-05-03 18:47:05.332662 26
2014-05-03 18:47:05.385109 25
2014-05-04 18:47:05.436523 62
2014-05-04 18:47:05.486877 41
# View all observations from May 2014.
# Row selection by partial date string goes through .loc; plain df['2014-05']
# is treated as a column lookup by current pandas and raises KeyError.
df.loc['2014-05']
battle_deaths
date
2014-05-01 18:47:05.069722 34
2014-05-01 18:47:05.119994 25
2014-05-02 18:47:05.178768 26
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.280592 14
2014-05-03 18:47:05.332662 26
2014-05-03 18:47:05.385109 25
2014-05-04 18:47:05.436523 62
2014-05-04 18:47:05.486877 41
# View all observations from 2014-05-03 onward (the slice has no end bound)
df[datetime(2014, 5, 3):]
battle_deaths
date
2014-05-03 18:47:05.332662 26
2014-05-03 18:47:05.385109 25
2014-05-04 18:47:05.436523 62
2014-05-04 18:47:05.486877 41

2014 年 5 月 3 日至 5 月 4 日之間的觀測

# 檢視 2014.5.3~4 的所有觀測
df['5/3/2014':'5/4/2014']
battle_deaths
date
2014-05-03 18:47:05.332662 26
2014-05-03 18:47:05.385109 25
2014-05-04 18:47:05.436523 62
2014-05-04 18:47:05.486877 41
# 截斷 2014.5.2 之後的觀測
df.truncate(after='5/3/2014')
battle_deaths
date
2014-05-01 18:47:05.069722 34
2014-05-01 18:47:05.119994 25
2014-05-02 18:47:05.178768 26
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.280592 14
# Observations from May 2014, using the month-year spelling of the date.
# Row selection by partial date string goes through .loc; plain df['5-2014']
# is treated as a column lookup by current pandas and raises KeyError.
df.loc['5-2014']
battle_deaths
date
2014-05-01 18:47:05.069722 34
2014-05-01 18:47:05.119994 25
2014-05-02 18:47:05.178768 26
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.230071 15
2014-05-02 18:47:05.280592 14
2014-05-03 18:47:05.332662 26
2014-05-03 18:47:05.385109 25
2014-05-04 18:47:05.436523 62
2014-05-04 18:47:05.486877 41
# 計算每個時間戳的觀測數
df.groupby(level=0).count()
battle_deaths
date
2014-05-01 18:47:05.069722 1
2014-05-01 18:47:05.119994 1
2014-05-02 18:47:05.178768 1
2014-05-02 18:47:05.230071 2
2014-05-02 18:47:05.280592 1
2014-05-03 18:47:05.332662 1
2014-05-03 18:47:05.385109 1
2014-05-04 18:47:05.436523 1
2014-05-04 18:47:05.486877 1
# 每天的 battle_deaths 均值
df.resample('D').mean()
battle_deaths
date
2014-05-01 29.5
2014-05-02 17.5
2014-05-03 25.5
2014-05-04 51.5
# 每天的 battle_deaths 總數
df.resample('D').sum()
battle_deaths
date
2014-05-01 59
2014-05-02 70
2014-05-03 51
2014-05-04 103
# 繪製每天的總死亡人數
df.resample('D').sum().plot()

# <matplotlib.axes._subplots.AxesSubplot at 0x11187a940> 
118142-f166c473baf1857f.png
png

相關文章