Pandas

mklzc發表於2024-10-22

Pandas 基礎

Pandas Series

Series 類似於一維陣列,但每個元素都帶有索引(標籤),可以透過索引來存取對應的值。

Series 的建立


import numpy as np
import pandas as pd

# Build a Series from a list, supplying explicit string labels as the index.
a = ["Google", "Runoob", "Wiki"]
labels = ["x", "y", "z"]
myvar = pd.Series(a, index=labels)
print(myvar["y"])  # look a value up by its label

# Build a Series from a dict: the dict keys become the index.
sites = {1: "Google", 2: "Runoob", 3: "Wiki"}
myvar = pd.Series(sites)
print(myvar)

# Passing index= together with a dict keeps only the listed keys.
wanted_keys = [1, 2]
myvar = pd.Series(sites, index=wanted_keys)
print(myvar)

# Build a Series from a NumPy array; a default integer index is assigned.
values = np.array([1, 2, 3, 4])
myvar = pd.Series(values)
print(myvar)

Series 的基本操作

import pandas as pd

s = pd.Series([12, 15, 13, 11])

# Iterate over (index label, value) pairs.
for index, value in s.items():
    line = f"Index: {index}, Value: {value}"
    print(line)

# Slicing with integers; the first slice asks for one element more than exists,
# which is allowed and simply stops at the end.
middle_to_end = s[1:4]
print(middle_to_end)
first_three = s[:3]
print(first_three)

# Add: assigning to a label that does not exist yet appends a new entry.
s[4] = 1
# Delete: remove the entry labelled 0 from s itself.
del s[0]
# drop() leaves s untouched and returns a new Series without label 1.
s_dropped = s.drop(1)
print(s_dropped)

Series 的基本運算

import numpy as np
import pandas as pd

s = pd.Series([12, 15, 13, 11])

# Arithmetic is applied element-wise.
doubled = s * 2
print(doubled)

# A boolean Series used as a mask keeps only the entries where it is True.
above_twelve = s > 12
print(s[above_twelve])

# NumPy functions can be applied to a Series directly.
roots = np.sqrt(s)
print(roots)

# Common aggregate statistics.
print(s.sum(), s.mean(), s.min(), s.max(), s.std())

Series 的屬性和方法

import pandas as pd

s = pd.Series([12, 15, 13, 11], index=list("abcd"))

# Index labels, underlying values, and summary statistics, in that order.
for summary in (s.index, s.values, s.describe()):
    print(summary)

# Index labels of the largest and the smallest value.
largest_at = s.idxmax()
smallest_at = s.idxmin()
print(largest_at, smallest_at)

print(s.shape)
print(s > 2)  # element-wise comparison yields a boolean Series
print(s.astype('float64'))  # astype returns a converted copy

Pandas DataFrame

DataFrame 是 Pandas 中的另一個核心資料結構,用於表示二維表格型資料。

DataFrame 的建立

import pandas as pd
import numpy as np

# Build from a list of rows.
data = [['Google', 10], ['Runoob', 12], ['wiki', 13]]
# columns= labels the columns; index= (not given here) would label the rows.
df = pd.DataFrame(data, columns=['Sites', 'Age'])
df['Sites'] = df['Sites'].astype(str)
df['Age'] = df['Age'].astype(float)
print(df)

# Build from a dict: each key is a column name, each value that column's data.
# (Named site_data rather than `dict` so the builtin dict type is not shadowed.)
site_data = {'Sites': ['Google', 'Runoob', 'Wiki'], 'Age': [10, 12, 13]}
df = pd.DataFrame(site_data)
print(df)

# Build from a NumPy ndarray.
# A NumPy array has a single dtype, so mixing strings and ints turns every
# cell into a string; convert Age back to integers after building the frame.
ndarray_data = np.array([['Google', 10], ['Runoob', 12], ['Wiki', 13]])
df = pd.DataFrame(ndarray_data, columns=['Sites', 'Age'])
df['Age'] = df['Age'].astype(int)
print(df)

DataFrame 的基本操作

import pandas as pd

data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}

df = pd.DataFrame(data)

# loc with a single label returns that row as a Series.
for row_label in (0, 1):
    print(df.loc[row_label])
# loc with a list of labels returns those rows (here the first and second).
print(df.loc[[0, 1]])

# With a custom index, rows are addressed by the custom labels.
day_labels = ["day1", "day2", "day3"]
df = pd.DataFrame(data, index=day_labels)
print(df.loc["day2"])

# Three ways to select the calories column:
print(df["calories"])         # by column name
print(df.loc[:, 'calories'])  # all rows, column chosen by label
print(df.iloc[:, 0])          # all rows, column chosen by position
import pandas as pd

data = {
    "calories" : [420, 380, 390],
    "duration" : [50, 40, 45]
}

df = pd.DataFrame(data)
print(df.shape)       # (number of rows, number of columns)
print(df.columns)     # column labels
print(df.index)       # row labels
print(df.head())      # first rows
print(df.tail())      # last rows
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())  # summary statistics per column
print(df.mean())      # per-column mean
print(df.sum())       # per-column sum
import pandas as pd

data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}

df = pd.DataFrame(data)
df['calories'] = [420, 390, 390]    # overwrite an existing column
df['NewColumn'] = [100, 200, 300]   # assigning to a new name creates a column
print(df)

# Assigning to a row label that does not exist yet appends a new row.
df.loc[3] = [1, 2, 3]
print(df)

# Append a row by concatenating a one-row DataFrame; ignore_index renumbers
# the rows 0..n-1.
new_row = pd.DataFrame(
    [[440, 40, 400]],
    columns=['calories', 'duration', 'NewColumn'],
)
df = pd.concat([df, new_row], ignore_index=True)
print(df)

# Drop a column; drop() returns a new DataFrame and leaves df unchanged.
df_dropped = df.drop(columns='NewColumn')
print(df_dropped)

# Drop the row labelled 3.
df_dropped = df.drop(index=3)
print(df_dropped)

# Keep only the rows whose calories value exceeds 400.
over_400 = df['calories'] > 400
print(df[over_400])
import pandas as pd

# Vertical concatenation: both frames share the same columns, so the rows
# are stacked and renumbered.
shared_columns = ['columns1', 'columns2', 'columns3']
df1 = pd.DataFrame([[1, 2, 3]], columns=shared_columns)
df2 = pd.DataFrame([[1, 5, 6]], columns=shared_columns)
stacked = pd.concat([df1, df2], ignore_index=True)
print(stacked)

# Merge: join the two frames on the key column they have in common.
df1 = pd.DataFrame([[1, 2, 3]], columns=shared_columns)
df2 = pd.DataFrame([[1, 5, 6]], columns=['columns1', 'columns5', 'columns6'])
merged = df1.merge(df2, on='columns1')
print(merged)

pandas 與 csv

csv 檔案的存取

import pandas as pd
import os

# NOTE: abspath('.') is the current working directory, so this chdir does not
# change where the relative file names below are resolved.
FilePath = os.path.abspath('.')
os.chdir(FilePath)

# Read the CSV file into a DataFrame.
df = pd.read_csv('DataForClassify.csv')
# index=False keeps the row index out of the output; without it the copy gains
# an extra unnamed first column that the source file does not have.
df.to_csv('DFC_cp.csv', index=False)

資料處理

# Preview the start and the end of the frame: head()/tail() show 5 rows by
# default, or the requested number of rows when an argument is given.
previews = (
    df.head(),
    df.head(10),
    df.tail(),
    df.tail(10),
)
for preview in previews:
    print(preview)

pandas 與 json

讀取json檔案

import pandas as pd

# Load a JSON file into a DataFrame.
json_source = 'test.json'
df = pd.read_json(json_source)
# A URL can be passed in place of a local path:
# df = pd.read_json(URL)

讀取內嵌的json

import pandas as pd
import os
import json

# NOTE: abspath('.') is the current working directory, so this chdir does not
# change where 'nest.json' is looked up.
FilePath = os.path.abspath('.')
os.chdir(FilePath)

# Open with an explicit UTF-8 encoding so non-ASCII text decodes the same way
# on every platform instead of depending on the locale's default codec, and
# let json.load parse straight from the file object.
with open('nest.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the nested records: one output row per entry under 'students',
# with the top-level 'class' value repeated on each row.
df_nest = pd.json_normalize(data, record_path = ['students'], meta = ['class'])
print(df_nest)

glom

import pandas as pd
from glom import glom

df = pd.read_json('nest.json')

# For every entry in the 'students' column, follow the dotted path
# 'grade.math' with glom to pull out the nested value.
students = df['students']
data = students.apply(lambda student: glom(student, 'grade.math'))
print(data)