通用機器學習演算法:線性迴歸+決策樹+Xgboost

coyan發表於2021-09-09
import os
import pandas as pd 
import numpy  as np 

def train_data_reads(path):
    """Load the training data set found under ``<path>/data``.

    The first entry returned by ``os.listdir`` is the file that gets read;
    that order is arbitrary, so the directory is expected to hold a single
    data file.

    Args:
        path: Project root directory; the data file is looked up in its
            ``data`` sub-directory.

    Returns:
        A ``pandas.DataFrame`` with the file contents.  ``.csv`` files are
        parsed as comma separated, ``.txt`` files as tab separated, and any
        other extension is handed to ``pandas.read_excel``.

    Raises:
        FileNotFoundError: If ``<path>/data`` does not exist.
        IndexError: If ``<path>/data`` is empty.
    """
    data_directory = os.path.join(path, "data")
    data_name_list = os.listdir(data_directory)
    file_name = data_name_list[0]
    data_path = os.path.join(data_directory, file_name)

    # splitext copes with names that contain several dots (or none at all),
    # which a plain split(".") cannot unpack into exactly two parts.
    extension = os.path.splitext(file_name)[1].lower()

    if extension in (".csv", ".txt"):
        # Text files are tab separated; csv files use the default comma.
        read_options = {"sep": "\t"} if extension == ".txt" else {}
        try:
            data = pd.read_csv(data_path, encoding="gbk", **read_options)
        except UnicodeDecodeError:
            # Not GBK encoded: retry as UTF-8.  Any other failure propagates.
            data = pd.read_csv(data_path, encoding="utf-8", **read_options)
    else:
        data = pd.read_excel(data_path)

    return data

def train_data_reprocess(data):
    """Return a copy of *data* without duplicated rows.

    The first occurrence of each row is kept and the result is re-indexed
    from zero; the input frame itself is left untouched.
    """
    deduplicated = data.drop_duplicates()
    return deduplicated.reset_index(drop=True)

def feature_label_split(data):
    """Split *data* into a feature frame and a label series.

    The last column is treated as the label.  Rows whose label is missing
    are discarded, the ``ID`` column (when present) is removed, and missing
    feature values are filled in.

    Only columns whose name contains ``"class"`` or ``"num"`` somewhere
    after the first character are kept as features; every other column is
    dropped from the result.

    Args:
        data: Frame whose last column is the label.

    Returns:
        ``(new_x, y)`` where ``new_x`` holds the class columns (missing
        values replaced by the string ``"missing"``) followed by the num
        columns (missing values replaced by that column's mean), and ``y``
        is the label column.
    """
    label_name = data.columns[-1]

    # Drop rows without a label.  notna() also works for non-float labels,
    # where np.isnan would raise.
    data = data[data[label_name].notna()]

    # Separate features from the label; a frame without "ID" is accepted.
    x = data.drop(columns=["ID", label_name], errors="ignore")
    y = data[label_name]

    # Fill in missing feature values.
    class_name_list = [name for name in x.columns if name.find("class") > 0]
    num_name_list = [name for name in x.columns if name.find("num") > 0]
    class_filled_df = x[class_name_list].fillna("missing")
    # Average only the numeric feature columns: taking the mean of the whole
    # frame fails on pandas >= 2.0 as soon as a text column is present.
    num_columns = x[num_name_list]
    num_filled_df = num_columns.fillna(num_columns.mean())
    new_x = pd.concat([class_filled_df, num_filled_df], axis=1)
    return new_x, y
	
# Convert categorical features into dummy (one-hot) variables
def dummy_variable_transform(x):
    """Replace every ``<name>_class`` column of *x* by dummy columns.

    A column is treated as categorical when the text after its first
    underscore is exactly ``"class"``.  Each such column is removed and its
    dummy columns, prefixed with ``<name>`` and with the first level
    dropped, are appended on the right.  All other columns, including those
    whose name has no underscore, are left unchanged.

    Args:
        x: Feature frame.

    Returns:
        A new frame with the categorical columns expanded.
    """
    # Snapshot the column names up front: x is rebound inside the loop.
    for feature_name in x.columns.values.tolist():
        # partition() yields an empty type for names without an underscore
        # instead of failing on a missing second element.
        name, _, feature_type = feature_name.partition("_")
        if feature_type != "class":
            continue
        dummy_class = pd.get_dummies(x[feature_name], prefix=name, drop_first=True)
        x = x.drop(feature_name, axis=1).join(dummy_class)
    return x

# Normalize the feature set X.
# Linear regression is sensitive to the maximum and minimum values, so it is
# worth considering whether standardization or min-max normalization fits better.
def data_normalization(x):
    """Rescale every column of *x* with scikit-learn's ``MinMaxScaler``.

    Args:
        x: Numeric feature matrix accepted by ``MinMaxScaler``.

    Returns:
        The transformed data as returned by the scaler, fitted on *x* itself
        with ``feature_range=(0, 1)``.
    """
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler(feature_range=(0, 1))
    # Fit and transform on the same data in one call.
    return scaler.fit_transform(x)

# Split the data into a training set and a test set
def train_test_div(x, y, percent):
    """Divide features and labels into train and test partitions.

    Args:
        x: Feature matrix.
        y: Labels aligned with *x*.
        percent: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    # train_test_split: shuffles the order first, then performs the split.
    parts = train_test_split(x, y, test_size=percent)
    features_train, features_test, target_train, target_test = parts
    return features_train, features_test, target_train, target_test


# 1. Linear regression prediction
def lin_predict(x_train, x_test, y_train, y_test):
    """Fit a ``LinearRegression`` model and score it on the test split.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        ``(rmse, r2)``: the root-mean-squared error and the R^2 score of the
        predictions on the test split.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score

    linreg = LinearRegression()
    linreg.fit(x_train, y_train)

    y_pred = linreg.predict(x_test)
    # Anything that is not >= 0 is replaced by 0, as a negative prediction
    # is treated as invalid here.
    y_pred = np.where(y_pred >= 0, y_pred, 0)
    # Root-mean-squared error is reported as the result.
    rmse = np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2
	
# 2. Decision tree prediction
# Decision trees do not need the variables converted to dummy variables.
# NOTE(review): scikit-learn trees still expect numeric input; confirm how
# categorical columns are encoded before calling this.
def tree_predict(x_train, x_test, y_train, y_test):
    """Fit a depth-5 ``DecisionTreeRegressor`` and score it on the test split.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        ``(rmse, r2)``: the root-mean-squared error and the R^2 score of the
        predictions on the test split.
    """
    from sklearn.metrics import r2_score  # was missing: r2_score raised NameError
    from sklearn.tree import DecisionTreeRegressor

    reg = DecisionTreeRegressor(max_depth=5)
    reg.fit(x_train, y_train)

    y_pred = reg.predict(x_test)
    # Anything that is not >= 0 is replaced by 0.
    y_pred = np.where(y_pred >= 0, y_pred, 0)
    # Root-mean-squared error is reported as the result.
    rmse = np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2

# 3. XGBoost regression
# XGBoost does not need the variables converted to dummy variables.
# NOTE(review): confirm the input columns are in a form XGBRegressor accepts.
def xgb_predict(x_train, x_test, y_train, y_test):
    """Fit an ``XGBRegressor`` and score it on the test split.

    The model uses ``learning_rate=0.05``, ``max_depth=5`` and
    ``n_estimators=500``.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        ``(rmse, r2)``: the root-mean-squared error and the R^2 score of the
        predictions on the test split.
    """
    from sklearn.metrics import r2_score  # was missing: r2_score raised NameError
    from xgboost import XGBRegressor

    reg = XGBRegressor(learning_rate=0.05, max_depth=5, n_estimators=500)
    reg.fit(x_train, y_train)

    y_pred = reg.predict(x_test)
    # Anything that is not >= 0 is replaced by 0.
    y_pred = np.where(y_pred >= 0, y_pred, 0)
    # Root-mean-squared error is reported as the result.
    rmse = np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2


def main(path="E:/AnaLinReg/Data"):
    """Run the pipeline end to end with the linear regression model.

    Reads the data set, removes duplicate rows, splits features from the
    label, expands categorical columns, normalizes the features, holds out
    30% of the rows for testing, and prints the two values returned by
    ``lin_predict`` (error metric first, then R^2).

    Args:
        path: Project root passed to ``train_data_reads``.
    """
    data = train_data_reads(path)
    data = train_data_reprocess(data)
    x, y = feature_label_split(data)
    x = dummy_variable_transform(x)
    x = data_normalization(x)
    # Pass the prepared x and y; the former x3/y2 names were never defined.
    x_train, x_test, y_train, y_test = train_test_div(x, y, 0.3)
    rmse, r2 = lin_predict(x_train, x_test, y_train, y_test)
    print(rmse)
    print(r2)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
	main()

來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/2236/viewspace-2822850/,如需轉載,請註明出處,否則將追究法律責任。