利用 pandas 進行資料清洗,並利用神經網路預測成績(參考資料酷客:學生成績預測)

你今天學習了嘛 發表於 2020-05-09

老師佈置的大資料處理作業:需要採集本專業所有學生的各科成績、學分績點與成績排名,並根據成績資料,對本專業同學的整體學習情況進行分析說明。

# Load the raw grade table (the CSV is GBK-encoded).
import pandas as pd

All_Data = pd.read_csv(
    filepath_or_buffer='./input/18BigData.csv',
    encoding="GBK",
)

# Data preprocessing follows here; it is not shown because the data contains
# student names.

# Derive every row's grade point (績點) from its total score (總分).
def _grade_point(score):
    """Return the grade point for one total score, as a string.

    A score whose value truncated to an integer is below 60 earns '0';
    any other score earns ``score / 10 - 5`` rounded to 4 decimal places.
    """
    # NOTE: int() truncates, so e.g. 59.9 is treated as 59 and earns '0'.
    if int(score) < 60:
        return '0'
    return str(round(float(score) / 10 - 5, 4))


# Map the whole column in one pass. Writing cell by cell through
# Data.loc[index, ...] inside iterrows() is slow and raises as soon as the
# index holds a duplicate label, because .loc then yields a Series rather
# than a scalar.
Data['績點'] = Data['總分'].map(_grade_point)
Data.head()

# Visualization 1: mean total score of four courses, drawn as one group of
# horizontal bars per course with one bar per class.
_frames_by_class = {
    '18大資料1': (C_class1, CC_class1, java_class1, ShuJuJieGou_class1),
    '18大資料2': (C_class2, CC_class2, java_class2, ShuJuJieGou_class2),
    '18大資料3': (C_class3, CC_class3, java_class3, ShuJuJieGou_class3),
}
_course_labels = ['C', 'C++', 'JAVA', '資料結構']
s = pd.DataFrame(
    {
        class_name: [frame['總分'].mean() for frame in frames]
        for class_name, frames in _frames_by_class.items()
    },
    index=_course_labels,
)
s.plot(kind='barh')

# Visualization 2: mean total score of every course, one chart per class.
# NOTE(review): the list content below appears to be a placeholder left by the
# author (it is not defined anywhere visible) -- substitute the real course names.
course=[各種課程名]


def _course_means(frame, courses):
    """Return the mean 總分 found in `frame` for each course name, in order.

    A course with no rows in `frame` contributes NaN (the mean of an empty
    selection).
    """
    return [frame.loc[frame['課程名'] == name, '總分'].mean() for name in courses]


Data1 = Data[Data['班級']=='18大資料1']
Data2 = Data[Data['班級']=='18大資料2']
Data3 = Data[Data['班級']=='18大資料3']

# Computed through a helper so that no module-level loop variable is needed:
# the previous `for s in course` loop overwrote the DataFrame `s` built for
# visualization 1.
l1 = _course_means(Data1, course)
l2 = _course_means(Data2, course)
l3 = _course_means(Data3, course)

s1 = pd.DataFrame(l1,index=course,columns=['18大資料1各科均分'])
s1.plot(kind='barh')
s2 = pd.DataFrame(l2,index=course,columns=['18大資料2各科均分'])
s2.plot(kind='barh')
s3 = pd.DataFrame(l3,index=course,columns=['18大資料3各科均分'])
s3.plot(kind='barh')

# Compute my own credit-weighted average grade point for three groups of
# courses; My_Data can be extracted by filtering on my own name.
# NOTE(review): the groups are presumably the three terms -- confirm that the
# 0:11 / 11:26 / 26: split matches the row order of My_Data.
def _credit_totals(frame):
    """Return (sum of credit * grade point, sum of credits) over `frame`'s rows."""
    weighted_sum = 0
    credit_sum = 0
    # Iterate the two columns directly instead of frame.loc[index, ...] inside
    # iterrows(): same arithmetic, but it also works with duplicate index labels.
    for credit, grade_point in zip(frame['學分'], frame['績點']):
        credit = float(credit)
        weighted_sum += credit * float(grade_point)
        credit_sum += credit
    return weighted_sum, credit_sum


def _weighted_average(weighted_sum, credit_sum):
    """Return weighted_sum / credit_sum rounded to 4 places, or NaN without credits."""
    if credit_sum == 0:
        # An empty group (e.g. fewer than 27 courses) used to raise ZeroDivisionError.
        return float('nan')
    return round(weighted_sum / credit_sum, 4)


_my_courses = My_Data['課程名'].tolist()
test1 = My_Data[My_Data['課程名'].isin(_my_courses[0:11])]
test2 = My_Data[My_Data['課程名'].isin(_my_courses[11:26])]
test3 = My_Data[My_Data['課程名'].isin(_my_courses[26:])]

m1, n1 = _credit_totals(test1)
m2, n2 = _credit_totals(test2)
m3, n3 = _credit_totals(test3)

JiDian = [
    _weighted_average(m1, n1),
    _weighted_average(m2, n2),
    _weighted_average(m3, n3),
]
J = pd.DataFrame(JiDian,index=['1','2','3'],columns=['平均學分績點'])
J

對 2 班的 C、C++、JAVA 總分進行分析:用 C 與 C++ 的總分預測 JAVA 的總分,並展示預測誤差:

# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Inner-join the C and C++ totals of class 2 on student ID (學號); the
# overlapping 總分 columns become 總分_C and 總分_C++.
k = pd.merge(C_class2[['學號','總分']],CC_class2[['學號','總分']],on='學號',how='inner',suffixes=('_C','_C++'))
# Rename on a copy rather than in place, so java_class2 keeps the '總分'
# column that other sections (e.g. visualization 1) read.
java_scores = java_class2[['學號','總分']].rename(columns={'總分':'總分_JAVA'})
major_class = pd.merge(k,java_scores,on='學號',how='inner')

# Standardize each total: subtract the column mean and divide by the
# column's standard deviation as returned by Series.std().
for column in ('總分_C', '總分_C++', '總分_JAVA'):
    major_class[column] = (major_class[column]-major_class[column].mean())/major_class[column].std()

# Features: standardized C and C++ totals; target: standardized JAVA total.
X = major_class[['總分_C','總分_C++']]
y = major_class['總分_JAVA']
# Split into training and test sets (15% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=10)
y_test

# KNN: fit a k-nearest-neighbours regressor with default settings and report
# its mean squared error on the test set.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

model_knn = KNeighborsRegressor()
model_knn.fit(X_train, y_train)
pre_knn = model_knn.predict(X_test)
# mean_squared_error is documented as (y_true, y_pred). MSE is symmetric, so
# the value is unchanged, but the documented order keeps the call correct if
# the metric is ever swapped for an asymmetric one.
mse_knn = mean_squared_error(y_test, pre_knn)
print(mse_knn)

## Neural network: train an MLP regressor for several hidden-layer layouts and
## print the test-set mean squared error of each.
from sklearn.neural_network import MLPRegressor

# One loop replaces four copy-pasted sections that differed only in
# hidden_layer_sizes. The layouts are tried in the original order, so the same
# four values are printed and model_NN / pre_NN / mse_NN end up holding the
# results of the last layout, (5, 5).
for hidden_layers in ((10,10), (20,20), (20,20,20), (5,5)):
    ### Build the model
    model_NN = MLPRegressor(solver='lbfgs', hidden_layer_sizes=hidden_layers, random_state=100)
    ### Train the model
    model_NN.fit(X_train, y_train)
    ### Predict on the test set
    pre_NN = model_NN.predict(X_test)
    ### Evaluate: mean squared error, passed as (y_true, y_pred) per the
    ### documented signature
    mse_NN = mean_squared_error(y_test, pre_NN)
    print(mse_NN)

相關文章