Using scikit-learn feature classification for vnpy futures quantitative trading (with code)
The full code is below; I have also put it in my GitHub repository.
It is already commented. You will see some warning messages when it runs. For everything else, see my other articles explaining the framework.
# encoding: UTF-8
import warnings
warnings.filterwarnings("ignore")

from pymongo import MongoClient, ASCENDING
import pandas as pd
import numpy as np
from datetime import datetime
import talib
import matplotlib.pyplot as plt
import scipy.stats as st
from sklearn.model_selection import train_test_split
# LogisticRegression (logistic regression)
from sklearn.linear_model import LogisticRegression
# DecisionTreeClassifier (decision tree)
from sklearn.tree import DecisionTreeClassifier
# SVC (support vector classification)
from sklearn.svm import SVC
# MLPClassifier (neural network)
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV


class DataAnalyzerforSklearn(object):
    """
    Prepares the data for the scikit-learn classifiers. The class label comes from a
    linear regression on the slope of the next 6 bars. Instead of feeding raw HLOC
    prices, only price-level-independent (non-linear) features are used:
    1. Percentage  2. std  3. MACD  4. CCI  5. ATR  6. RSI
    """
    def __init__(self, exportpath="C:\\Project\\",
                 datformat=['datetime', 'high', 'low', 'open', 'close', 'volume']):
        self.mongohost = None
        self.mongoport = None
        self.db = None
        self.collection = None
        self.df = pd.DataFrame()
        self.exportpath = exportpath
        self.datformat = datformat
        self.startBar = 2
        self.endBar = 12
        self.step = 2
        self.pValue = 0.015

    # ----------------------------------------- Data import -----------------------------------------------
    def db2df(self, db, collection, start, end, mongohost="localhost",
              mongoport=27017, export2csv=False):
        """Read bar records from MongoDB into a DataFrame."""
        self.mongohost = mongohost
        self.mongoport = mongoport
        self.db = db
        self.collection = collection
        dbClient = MongoClient(self.mongohost, self.mongoport, connectTimeoutMS=500)
        db = dbClient[self.db]
        cursor = db[self.collection].find(
            {'datetime': {'$gte': start, '$lt': end}}).sort("datetime", ASCENDING)
        self.df = pd.DataFrame(list(cursor))
        self.df = self.df[self.datformat]
        self.df = self.df.reset_index(drop=True)
        path = self.exportpath + self.collection + ".csv"
        if export2csv == True:
            self.df.to_csv(path, index=True, header=True)
        return self.df

    def csv2df(self, csvpath, dataname="csv_data", export2csv=False):
        """Read bar data from a csv file into a DataFrame."""
        csv_df = pd.read_csv(csvpath)
        self.df = csv_df[self.datformat]
        self.df["datetime"] = pd.to_datetime(self.df['datetime'])
        self.df = self.df.reset_index(drop=True)
        path = self.exportpath + dataname + ".csv"
        if export2csv == True:
            self.df.to_csv(path, index=True, header=True)
        return self.df

    def df2Barmin(self, inputdf, barmins, crossmin=1, export2csv=False):
        """Aggregate 1-minute bars into X-minute bars (3-minute, 5-minute, ...).
        If the session starts at 9:01 use crossmin=0; if it starts at 9:00 use crossmin=1."""
        dfbarmin = pd.DataFrame()
        highBarMin = 0
        lowBarMin = 0
        openBarMin = 0
        volumeBarmin = 0
        datetime = 0
        for i in range(0, len(inputdf) - 1):
            bar = inputdf.iloc[i, :].to_dict()
            if openBarMin == 0:
                openBarMin = bar["open"]
            if highBarMin == 0:
                highBarMin = bar["high"]
            else:
                highBarMin = max(bar["high"], highBarMin)
            if lowBarMin == 0:
                lowBarMin = bar["low"]
            else:
                lowBarMin = min(bar["low"], lowBarMin)
            closeBarMin = bar["close"]
            datetime = bar["datetime"]
            volumeBarmin += int(bar["volume"])
            # the X-minute bar is complete when the minute is divisible by X
            if not (bar["datetime"].minute + crossmin) % barmins:
                # emit the bar covering the last X minutes
                barMin = {'datetime': datetime, 'high': highBarMin, 'low': lowBarMin,
                          'open': openBarMin, 'close': closeBarMin, 'volume': volumeBarmin}
                dfbarmin = dfbarmin.append(barMin, ignore_index=True)
                highBarMin = 0
                lowBarMin = 0
                openBarMin = 0
                volumeBarmin = 0
        if export2csv == True:
            dfbarmin.to_csv(self.exportpath + "bar" + str(barmins) + str(self.collection) + ".csv",
                            index=True, header=True)
        return dfbarmin

    # ----------------------------------------- Indicator calculation -----------------------------------------------
    def dfcci(self, inputdf, n, export2csv=True):
        """Compute the CCI indicator with talib and append it to the DataFrame."""
        dfcci = inputdf
        dfcci["cci"] = None
        for i in range(n, len(inputdf)):
            df_ne = inputdf.loc[i - n + 1:i, :]
            cci = talib.CCI(np.array(df_ne["high"]), np.array(df_ne["low"]), np.array(df_ne["close"]), n)
            dfcci.loc[i, "cci"] = cci[-1]
        dfcci = dfcci.fillna(0)
        dfcci = dfcci.replace(np.inf, 0)
        if export2csv == True:
            dfcci.to_csv(self.exportpath + "dfcci" + str(self.collection) + ".csv", index=True, header=True)
        return dfcci

    def dfatr(self, inputdf, n, export2csv=True):
        """Compute the ATR indicator with talib and append it to the DataFrame."""
        dfatr = inputdf
        for i in range((n + 1), len(inputdf)):
            df_ne = inputdf.loc[i - n:i, :]
            atr = talib.ATR(np.array(df_ne["high"]), np.array(df_ne["low"]), np.array(df_ne["close"]), n)
            dfatr.loc[i, "atr"] = atr[-1]
        dfatr = dfatr.fillna(0)
        dfatr = dfatr.replace(np.inf, 0)
        if export2csv == True:
            dfatr.to_csv(self.exportpath + "dfatr" + str(self.collection) + ".csv", index=True, header=True)
        return dfatr

    def dfrsi(self, inputdf, n, export2csv=True):
        """Compute the RSI indicator with talib and append it to the DataFrame."""
        dfrsi = inputdf
        dfrsi["rsi"] = None
        for i in range(n + 1, len(inputdf)):
            df_ne = inputdf.loc[i - n:i, :]
            rsi = talib.RSI(np.array(df_ne["close"]), n)
            dfrsi.loc[i, "rsi"] = rsi[-1]
        dfrsi = dfrsi.fillna(0)
        dfrsi = dfrsi.replace(np.inf, 0)
        if export2csv == True:
            dfrsi.to_csv(self.exportpath + "dfrsi" + str(self.collection) + ".csv", index=True, header=True)
        return dfrsi

    def Percentage(self, inputdf, export2csv=True):
        """Compute the bar-to-bar close-to-close percentage change and append it to the DataFrame."""
        dfPercentage = inputdf
        for i in range(1, len(inputdf)):
            if dfPercentage.loc[i - 1, "close"] == 0.0:
                percentage = 0
            else:
                percentage = ((dfPercentage.loc[i, "close"] - dfPercentage.loc[i - 1, "close"]) /
                              dfPercentage.loc[i - 1, "close"]) * 100.0
            dfPercentage.loc[i, "Percentage"] = percentage
        dfPercentage = dfPercentage.fillna(0)
        dfPercentage = dfPercentage.replace(np.inf, 0)
        if export2csv == True:
            dfPercentage.to_csv(self.exportpath + "Percentage_" + str(self.collection) + ".csv",
                                index=True, header=True)
        return dfPercentage

    def dfMACD(self, inputdf, n, export2csv=False):
        """Compute the MACD indicator with talib and append it to the DataFrame."""
        dfMACD = inputdf
        for i in range(n, len(inputdf)):
            df_ne = inputdf.loc[i - n + 1:i, :]
            macd, signal, hist = talib.MACD(np.array(df_ne["close"]), 12, 26, 9)
            dfMACD.loc[i, "macd"] = macd[-1]
            dfMACD.loc[i, "signal"] = signal[-1]
            dfMACD.loc[i, "hist"] = hist[-1]
        dfMACD = dfMACD.fillna(0)
        dfMACD = dfMACD.replace(np.inf, 0)
        if export2csv == True:
            dfMACD.to_csv(self.exportpath + "macd" + str(self.collection) + ".csv", index=True, header=True)
        return dfMACD

    def dfSTD(self, inputdf, n, export2csv=False):
        """Compute the rolling standard deviation with talib and append it to the DataFrame."""
        dfSTD = inputdf
        for i in range(n, len(inputdf)):
            df_ne = inputdf.loc[i - n + 1:i, :]
            std = talib.STDDEV(np.array(df_ne["close"]), n)
            dfSTD.loc[i, "std"] = std[-1]
        dfSTD = dfSTD.fillna(0)
        dfSTD = dfSTD.replace(np.inf, 0)
        if export2csv == True:
            dfSTD.to_csv(self.exportpath + "dfSTD" + str(self.collection) + ".csv", index=True, header=True)
        return dfSTD

    # ----------------------------------------- Trend labelling -----------------------------------------------
    def addTrend(self, inputdf, trendsetp=6, export2csv=False):
        """Label each bar with the slope of a linear regression over the next `trendsetp` bars."""
        dfTrend = inputdf
        for i in range(1, len(dfTrend) - trendsetp - 1):
            histRe = np.array(dfTrend["close"])[i:i + trendsetp]
            xAixs = np.arange(trendsetp) + 1
            res = st.linregress(y=histRe, x=xAixs)
            if res.pvalue < self.pValue + 0.01:
                if res.slope > 0.5:
                    dfTrend.loc[i, "tradeindictor"] = 1
                elif res.slope < -0.5:
                    dfTrend.loc[i, "tradeindictor"] = -1
        dfTrend = dfTrend.fillna(0)
        dfTrend = dfTrend.replace(np.inf, 0)
        if export2csv == True:
            dfTrend.to_csv(self.exportpath + "addTrend" + str(self.collection) + ".csv", index=True, header=True)
        return dfTrend


def GirdValuate(X_train, y_train):
    """Grid-search four classifiers:
    1) LogisticRegression  2) DecisionTreeClassifier  3) SVC  4) MLPClassifier"""
    clf_DT = DecisionTreeClassifier()
    param_grid_DT = {'max_depth': [1, 2, 3, 4, 5, 6]}
    clf_Logit = LogisticRegression()
    param_grid_logit = {'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag']}
    clf_svc = SVC()
    param_grid_svc = {'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
                      'C': [1, 2, 4],
                      'gamma': [0.125, 0.25, 0.5, 1, 2, 4]}
    clf_mlp = MLPClassifier()
    param_grid_mlp = {"hidden_layer_sizes": [(100,), (100, 30)],
                      "solver": ['adam', 'sgd', 'lbfgs'],
                      "max_iter": [20],
                      "verbose": [False]}
    # bundle the classifiers and their parameter grids
    clf = [clf_DT, clf_Logit, clf_mlp, clf_svc]
    param_grid = [param_grid_DT, param_grid_logit, param_grid_mlp, param_grid_svc]

    from sklearn.model_selection import StratifiedKFold
    # 10-fold stratified cross-validation; n_jobs=-1 below runs the search in parallel
    kflod = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
    # run the grid search for each classifier
    for i in range(0, 4):
        grid = GridSearchCV(clf[i], param_grid[i], scoring='accuracy', n_jobs=-1, cv=kflod)
        grid.fit(X_train, y_train)
        print(grid.best_params_, ': ', grid.best_score_)


if __name__ == '__main__':
    # read the data
    # exportpath = "C:\\Users\shui0\OneDrive\Documents\Project\\"
    exportpath = "C:\\Project\\"
    DA = DataAnalyzerforSklearn(exportpath)
    # load from MongoDB
    start = datetime.strptime("20180501", '%Y%m%d')
    end = datetime.strptime("20190501", '%Y%m%d')
    df = DA.db2df(db="VnTrader_1Min_Db", collection="rb8888", start=start, end=end)
    df5min = DA.df2Barmin(df, 5)
    df5minAdd = DA.addTrend(df5min, export2csv=True)
    df5minAdd = DA.dfMACD(df5minAdd, n=34, export2csv=True)
    df5minAdd = DA.dfatr(df5minAdd, n=25, export2csv=True)
    df5minAdd = DA.dfrsi(df5minAdd, n=35, export2csv=True)
    df5minAdd = DA.dfcci(df5minAdd, n=30, export2csv=True)
    df5minAdd = DA.dfSTD(df5minAdd, n=30, export2csv=True)
    df5minAdd = DA.Percentage(df5minAdd, export2csv=True)

    # train / test split
    df_test = df5minAdd.loc[60:, :]           # skip the first 60 rows, which are mostly warm-up zeros
    y = np.array(df_test["tradeindictor"])    # keep only the trend label, as an array
    X = df_test.drop(["tradeindictor", "close", "datetime", "high", "low", "open", "volume"],
                     axis=1).values           # drop raw HLOC and the label, keep only the feature columns
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)  # 70/30 split
    print("Training set size: %s, test set size: %s" % (len(X_train), len(X_test)))

    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import SelectPercentile
    from sklearn.feature_selection import mutual_info_classif
    # Feature selection: keep the top 70% of features ranked by mutual information;
    # SelectKBest can be used instead to keep a fixed number of features.
    print(X_train.shape)
    selectPer = SelectPercentile(mutual_info_classif, percentile=70)
    # selectPer = SelectKBest(mutual_info_classif, k=7)
    X_train = selectPer.fit_transform(X_train, y_train)
    print(X_train.shape)
    X_test = selectPer.transform(X_test)
    # SelectFpr is another option:
    # selectFea = SelectFpr(alpha=0.01)
    # X_train_new = selectFea.fit_transform(X_train, y_train)
    # X_test_new = selectFea.transform(X_test)

    # grid-search the candidate models
    GirdValuate(X_train, y_train)

    # evaluate the best model on the test set:
    # - prediction: model.predict()
    # - accuracy:   metrics.accuracy_score()
    # - precision:  metrics.precision_score()
    # - recall:     metrics.recall_score()
    from sklearn import metrics
    # plug in the best model and parameters reported by the grid search, e.g.
    # {'hidden_layer_sizes': (100, 30), 'max_iter': 20, 'solver': 'adam', 'verbose': False} : 0.9897016507648039
    clf_selected = MLPClassifier(hidden_layer_sizes=(100, 30), max_iter=20, solver='adam')
    clf_selected.fit(X_train, y_train)
    y_pred = clf_selected.predict(X_test)
    # accuracy
    accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    print('accuracy:', accuracy)
    # precision
    precision = metrics.precision_score(y_true=y_test, y_pred=y_pred, average="micro")
    print('precision:', precision)
    # recall
    recall = metrics.recall_score(y_true=y_test, y_pred=y_pred, average="micro")
    print('recall:', recall)
    # actual vs. predicted values
    print(y_test)
    print(y_pred)
    dfresult = pd.DataFrame({'Actual': y_test, 'Predict': y_pred})
    dfresult.to_csv(exportpath + "result" + ".csv", index=True, header=True)

    from sklearn.externals import joblib   # on newer scikit-learn: `import joblib`
    # save the model to disk
    joblib.dump(clf_selected, 'clf_selected.m')
    # restore the model
    clf_tmp = joblib.load('clf_selected.m')
Run results:
Training set size: 11673, test set size: 5003
(11673, 8)
(11673, 5)
accuracy: 0.7833300019988008
precision: 0.7833300019988008
recall: 0.7833300019988008
[ 1. 0. 0. ... 0. 0. -1.]
[0. 0. 0. ... 0. 0. 0.]
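Two reading notes on this output: the two arrays at the end are y_test (actual labels) and y_pred (predictions), and because precision_score and recall_score are called with average="micro" on a multi-class problem, both collapse to the same value as accuracy, which is why the three metrics above are identical. A per-class breakdown is more informative, especially if the 0 (no-trend) label dominates the predictions as the printed arrays suggest. Here is a minimal sketch, assuming the y_test and y_pred arrays from the script above are still in scope:

from sklearn.metrics import classification_report, confusion_matrix

# per-class precision / recall / F1 for the labels -1, 0 and 1
print(classification_report(y_test, y_pred, digits=4))
# rows = actual class, columns = predicted class
print(confusion_matrix(y_test, y_pred, labels=[-1, 0, 1]))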
A brief note on using this inside vnpy: in the strategy's init method, load the model with clf_tmp = joblib.load('clf_selected.m'); then in the onXminBar callback, compute the same feature values with an ArrayManager, call clf_tmp.predict() to get the predicted class, and open a long position on 1, a short position on -1, and do nothing on 0.
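Below is a rough sketch of that wiring, under clearly stated assumptions: the class skeleton, the onInit/onXminBar names and the bar/ArrayManager attributes follow the usual vnpy CtaTemplate pattern and may differ between vnpy versions, and the feature row must be built in exactly the same column order used for training (macd, signal, hist, atr, rsi, cci, std, Percentage). The training script above only saves the classifier, so in practice the fitted SelectPercentile (or a scikit-learn Pipeline of selector plus classifier) should be saved as well and applied to the raw feature row before predict(); the selector.m file name used here is hypothetical.

import numpy as np
import talib
import joblib   # older scikit-learn: from sklearn.externals import joblib


class SklearnTrendStrategy(object):               # in a real strategy: inherit from CtaTemplate
    def onInit(self):
        """Load the trained artifacts once, when the strategy initializes."""
        self.clf = joblib.load('clf_selected.m')       # classifier trained above
        self.selector = joblib.load('selector.m')      # hypothetical: the fitted SelectPercentile

    def onXminBar(self, bar):
        """Called on every finished 5-minute bar; self.am is an ArrayManager kept up to date."""
        am = self.am
        if not am.inited:
            return
        # rebuild the training features in the same order: macd, signal, hist, atr, rsi, cci, std, Percentage
        macd, signal, hist = talib.MACD(am.close, 12, 26, 9)
        atr = talib.ATR(am.high, am.low, am.close, 25)[-1]
        rsi = talib.RSI(am.close, 35)[-1]
        cci = talib.CCI(am.high, am.low, am.close, 30)[-1]
        std = talib.STDDEV(am.close, 30)[-1]
        pct = (am.close[-1] - am.close[-2]) / am.close[-2] * 100.0
        row = np.array([[macd[-1], signal[-1], hist[-1], atr, rsi, cci, std, pct]])
        row = self.selector.transform(row)             # same feature selection as in training
        label = self.clf.predict(row)[0]
        if label == 1:
            self.buy(bar.close, 1)                     # open long
        elif label == -1:
            self.short(bar.close, 1)                   # open short
        # label == 0: do nothing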
From the ITPUB blog, link: http://blog.itpub.net/22259926/viewspace-2648828/. Please credit the source when reprinting; otherwise legal action may be taken.