利用Scikit-learn機器學習庫的特徵分類進行vnpy期貨量化交易(程式碼)
程式碼如下。我也放在我的GitHub裡面。
程式碼已經加上註釋,執行的時候會出現 warning 資訊。其他內容可以參考那篇框架解釋文章。
# encoding: UTF-8
"""Feature classification with scikit-learn for vnpy futures trading.

Pipeline: load 1-minute bars (MongoDB or CSV), merge them into X-minute bars,
label every bar with a trend class, compute price-level-independent indicator
features with TA-Lib, select features, grid-search several classifiers and
persist the chosen model with joblib.
"""
import warnings

# Installed before the heavy imports so that import-time warnings are
# silenced as well (this matches the original script's statement order).
warnings.filterwarnings("ignore")

from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import talib
from pymongo import MongoClient, ASCENDING
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import mutual_info_classif
# LogisticRegression: logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
# MLPClassifier: multi-layer perceptron (neural network)
from sklearn.neural_network import MLPClassifier
# SVC: support vector classification
from sklearn.svm import SVC
# DecisionTreeClassifier: decision tree
from sklearn.tree import DecisionTreeClassifier

try:
    import joblib
except ImportError:
    # Old scikit-learn releases only shipped joblib as sklearn.externals.joblib;
    # that alias no longer exists in current releases.
    from sklearn.externals import joblib


class DataAnalyzerforSklearn(object):
    """Prepare bar data and features for scikit-learn classifiers.

    The class label of a bar is derived from the slope of a linear regression
    over a window of closes starting at that bar (see ``addTrend``).

    HLOC prices are not used as features directly. The features are values
    that do not depend on the absolute price level:
    percentage change, standard deviation, MACD, CCI, ATR and RSI.
    (The original notes also list "slope of the moving average before the
    bar"; no method of this class computes it.)

    All indicator methods address rows by position, return a new DataFrame
    and leave the DataFrame passed in untouched.
    """

    def __init__(self, exportpath="C:\\Project\\", datformat=None):
        """Create an analyzer.

        :param exportpath: prefix prepended to every exported CSV file name
            (plain string concatenation, so include the trailing separator).
        :param datformat: column names to keep from the raw data; defaults to
            ['datetime', 'high', 'low', 'open', 'close', 'volume'].
        """
        if datformat is None:
            # Built per instance: a list literal in the signature would be
            # shared between all instances.
            datformat = ['datetime', 'high', 'low', 'open', 'close', 'volume']
        self.mongohost = None
        self.mongoport = None
        self.db = None
        self.collection = None
        self.df = pd.DataFrame()
        self.exportpath = exportpath
        # Must be a list: indexing a DataFrame with a tuple means one
        # (multi-level) key, not several columns.
        self.datformat = list(datformat)
        self.startBar = 2
        self.endBar = 12
        self.step = 2
        self.pValue = 0.015

    # ----------------------------------------- helpers -----------------------------------------
    def _clean_and_export(self, df, prefix, export2csv):
        """Replace NaN and +/-inf with 0, optionally write ``<exportpath><prefix><collection>.csv``."""
        df = df.fillna(0)
        df = df.replace([np.inf, -np.inf], 0)
        if export2csv:
            df.to_csv(self.exportpath + prefix + str(self.collection) + ".csv", index=True, header=True)
        return df

    @staticmethod
    def _as_float(frame, column):
        """Return ``frame[column]`` as a float64 array (TA-Lib only accepts doubles)."""
        return np.asarray(frame[column], dtype=float)

    @staticmethod
    def _rolling_indicator(inputdf, first, lookback, columns, compute):
        """Evaluate an indicator on a sliding window and keep the last value.

        For every row position ``i`` from ``first`` on, ``compute`` receives the
        rows ``i - lookback .. i`` (inclusive) and returns one array per name in
        ``columns``; the last element of each array is stored at row ``i``.
        Rows before ``first`` stay NaN.

        :return: a copy of ``inputdf`` with the float columns ``columns`` set.
        """
        result = inputdf.copy()
        size = len(result)
        values = dict((name, np.full(size, np.nan)) for name in columns)
        for i in range(first, size):
            window = inputdf.iloc[i - lookback:i + 1]
            for name, series in zip(columns, compute(window)):
                values[name][i] = series[-1]
        for name in columns:
            result[name] = values[name]
        return result

    # ----------------------------------------- data import -----------------------------------------
    def db2df(self, db, collection, start, end, mongohost="localhost", mongoport=27017, export2csv=False):
        """Read bar records from MongoDB into a DataFrame.

        Selects the documents with ``start <= datetime < end``, sorted by
        ``datetime`` ascending, and keeps the columns listed in ``datformat``.
        An empty result yields an empty DataFrame with those columns.

        :return: the DataFrame, also stored in ``self.df``.
        """
        self.mongohost = mongohost
        self.mongoport = mongoport
        self.db = db
        self.collection = collection
        dbClient = MongoClient(self.mongohost, self.mongoport, connectTimeoutMS=500)
        try:
            cursor = dbClient[self.db][self.collection].find(
                {'datetime': {'$gte': start, '$lt': end}}).sort("datetime", ASCENDING)
            records = list(cursor)
        finally:
            # Release the connection even when the query fails.
            dbClient.close()
        if records:
            self.df = pd.DataFrame(records)[self.datformat]
        else:
            self.df = pd.DataFrame(columns=self.datformat)
        self.df = self.df.reset_index(drop=True)
        if export2csv:
            self.df.to_csv(self.exportpath + self.collection + ".csv", index=True, header=True)
        return self.df

    def csv2df(self, csvpath, dataname="csv_data", export2csv=False):
        """Read bar data from a CSV file into a DataFrame.

        Keeps the columns listed in ``datformat`` and parses ``datetime``.

        :return: the DataFrame, also stored in ``self.df``.
        """
        csv_df = pd.read_csv(csvpath)
        # Copy so that the datetime conversion does not write into a slice of csv_df.
        self.df = csv_df[self.datformat].copy()
        self.df["datetime"] = pd.to_datetime(self.df['datetime'])
        self.df = self.df.reset_index(drop=True)
        if export2csv:
            self.df.to_csv(self.exportpath + dataname + ".csv", index=True, header=True)
        return self.df

    def df2Barmin(self, inputdf, barmins, crossmin=1, export2csv=False):
        """Merge 1-minute bars into ``barmins``-minute bars (e.g. 3 or 5 minutes).

        A merged bar is closed on the row whose ``(minute + crossmin)`` is
        divisible by ``barmins``. Use ``crossmin=0`` when the first bar of a
        session is stamped 9:01 and ``crossmin=1`` when it is stamped 9:00.
        The merged bar carries the timestamp of its last 1-minute bar. Rows
        left over after the last completed bar are discarded.

        :return: DataFrame with the columns
            datetime, high, low, open, close, volume.
        """
        columns = ['datetime', 'high', 'low', 'open', 'close', 'volume']
        bars = []
        bucket = None  # the bar currently being accumulated
        for row in inputdf.to_dict("records"):
            if bucket is None:
                # First 1-minute bar of the bucket fixes the open price.
                bucket = {'open': row["open"], 'high': row["high"], 'low': row["low"], 'volume': 0}
            else:
                bucket['high'] = max(row["high"], bucket['high'])
                bucket['low'] = min(row["low"], bucket['low'])
            bucket['close'] = row["close"]
            bucket['datetime'] = row["datetime"]
            bucket['volume'] += int(row["volume"])

            # The X-minute period is complete.
            if (row["datetime"].minute + crossmin) % barmins == 0:
                bars.append(bucket)
                bucket = None

        # Build the frame once; appending row by row is quadratic.
        dfbarmin = pd.DataFrame(bars, columns=columns)
        if export2csv:
            dfbarmin.to_csv(self.exportpath + "bar" + str(barmins) + str(self.collection) + ".csv",
                            index=True, header=True)
        return dfbarmin

    # ----------------------------------------- indicators -----------------------------------------
    def dfcci(self, inputdf, n, export2csv=True):
        """Add column ``cci``: TA-Lib CCI(n) evaluated on the last n bars of every row."""
        as_float = self._as_float

        def compute(window):
            return (talib.CCI(as_float(window, "high"), as_float(window, "low"),
                              as_float(window, "close"), n),)

        dfcci = self._rolling_indicator(inputdf, n, n - 1, ["cci"], compute)
        return self._clean_and_export(dfcci, "dfcci", export2csv)

    def dfatr(self, inputdf, n, export2csv=True):
        """Add column ``atr``: TA-Lib ATR(n) evaluated on the last n + 1 bars of every row."""
        as_float = self._as_float

        def compute(window):
            return (talib.ATR(as_float(window, "high"), as_float(window, "low"),
                              as_float(window, "close"), n),)

        dfatr = self._rolling_indicator(inputdf, n + 1, n, ["atr"], compute)
        return self._clean_and_export(dfatr, "dfatr", export2csv)

    def dfrsi(self, inputdf, n, export2csv=True):
        """Add column ``rsi``: TA-Lib RSI(n) evaluated on the last n + 1 bars of every row."""
        as_float = self._as_float

        def compute(window):
            return (talib.RSI(as_float(window, "close"), n),)

        dfrsi = self._rolling_indicator(inputdf, n + 1, n, ["rsi"], compute)
        return self._clean_and_export(dfrsi, "dfrsi", export2csv)

    def Percentage(self, inputdf, export2csv=True):
        """Add the close-to-close change in percent.

        Row i gets ``(close[i] - close[i-1]) / close[i-1] * 100``; the value is
        0 for the first row and wherever the previous close is 0.

        NOTE: the column is named ``Perentage`` (sic). The misspelling is kept
        so that the returned columns and exported CSV files stay unchanged.
        """
        dfPercentage = inputdf.copy()
        close = self._as_float(dfPercentage, "close")
        change = np.zeros(len(close))
        previous = close[:-1]
        # Entries with a zero previous close are skipped and keep their 0.
        np.divide(close[1:] - previous, previous, out=change[1:], where=previous != 0)
        dfPercentage["Perentage"] = change * 100.0
        return self._clean_and_export(dfPercentage, "Percentage_", export2csv)

    def dfMACD(self, inputdf, n, export2csv=False):
        """Add columns ``macd``, ``signal``, ``hist``: TA-Lib MACD(12, 26, 9).

        The indicator is evaluated on the last n bars of every row; only the
        window length is configurable, the MACD periods are fixed.
        """
        as_float = self._as_float

        def compute(window):
            return talib.MACD(as_float(window, "close"), 12, 26, 9)

        dfMACD = self._rolling_indicator(inputdf, n, n - 1, ["macd", "signal", "hist"], compute)
        return self._clean_and_export(dfMACD, "macd", export2csv)

    def dfSTD(self, inputdf, n, export2csv=False):
        """Add column ``std``: TA-Lib STDDEV(n) of the close over the last n bars of every row."""
        as_float = self._as_float

        def compute(window):
            return (talib.STDDEV(as_float(window, "close"), n),)

        dfSTD = self._rolling_indicator(inputdf, n, n - 1, ["std"], compute)
        return self._clean_and_export(dfSTD, "dfSTD", export2csv)

    # ----------------------------------------- trend label -----------------------------------------
    def addTrend(self, inputdf, trendsetp=6, export2csv=False):
        """Add the class label column ``tradeindictor``.

        For row i a linear regression is fitted to the ``trendsetp`` closes
        starting at row i. When its p-value is below ``self.pValue + 0.01`` the
        label is 1 for a slope above 0.5 and -1 for a slope below -0.5. Every
        other row, including the first row and the trailing rows that are not
        evaluated, is labelled 0.
        """
        dfTrend = inputdf.copy()
        closes = self._as_float(dfTrend, "close")
        xAxis = np.arange(trendsetp) + 1
        # Pre-filled with 0 so the column exists even when no window qualifies.
        labels = np.zeros(len(dfTrend), dtype=float)
        for i in range(1, len(dfTrend) - trendsetp - 1):
            res = st.linregress(y=closes[i:i + trendsetp], x=xAxis)
            if res.pvalue < self.pValue + 0.01:
                if res.slope > 0.5:
                    labels[i] = 1
                elif res.slope < -0.5:
                    labels[i] = -1
        dfTrend["tradeindictor"] = labels
        return self._clean_and_export(dfTrend, "addTrend", export2csv)


def GirdValuate(X_train, y_train):
    """Grid-search four classifiers and print the best setting of each.

    Evaluated in this order: 1) DecisionTreeClassifier, 2) LogisticRegression,
    3) MLPClassifier, 4) SVC. Each search scores by accuracy with a shuffled
    10-fold stratified cross-validation.

    :return: list of ``(best_params, best_score)`` tuples in the order above.
    """
    candidates = [
        (DecisionTreeClassifier(), {'max_depth': [1, 2, 3, 4, 5, 6]}),
        (LogisticRegression(), {'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag']}),
        (MLPClassifier(), {"hidden_layer_sizes": [(100,), (100, 30)],
                           "solver": ['adam', 'sgd', 'lbfgs'],
                           "max_iter": [20],
                           "verbose": [False]}),
        (SVC(), {'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
                 'C': [1, 2, 4],
                 'gamma': [0.125, 0.25, 0.5, 1, 2, 4]}),
    ]

    # Cross-validation: 10 mutually exclusive, stratified folds.
    kflod = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

    results = []
    for estimator, grid_params in candidates:
        grid = GridSearchCV(estimator, grid_params, scoring='accuracy', n_jobs=-1, cv=kflod)
        grid.fit(X_train, y_train)
        print (grid.best_params_, ': ', grid.best_score_)
        results.append((grid.best_params_, grid.best_score_))
    return results


def main():
    """Run the whole experiment: load, label, build features, tune, evaluate, persist."""
    # Export location (same value as before, written with valid escapes).
    exportpath = "C:\\Project\\"
    DA = DataAnalyzerforSklearn(exportpath)

    # Load from the database.
    start = datetime.strptime("20180501", '%Y%m%d')
    end = datetime.strptime("20190501", '%Y%m%d')
    df = DA.db2df(db="VnTrader_1Min_Db", collection="rb8888", start=start, end=end)
    df5min = DA.df2Barmin(df, 5)
    df5minAdd = DA.addTrend(df5min, export2csv=True)
    df5minAdd = DA.dfMACD(df5minAdd, n=34, export2csv=True)
    df5minAdd = DA.dfatr(df5minAdd, n=25, export2csv=True)
    df5minAdd = DA.dfrsi(df5minAdd, n=35, export2csv=True)
    df5minAdd = DA.dfcci(df5minAdd, n=30, export2csv=True)
    df5minAdd = DA.dfSTD(df5minAdd, n=30, export2csv=True)
    df5minAdd = DA.Percentage(df5minAdd, export2csv=True)

    # Train/test split. Analysis starts at row 60 because the indicator
    # columns of the earlier rows are mostly empty.
    df_test = df5minAdd.loc[60:, :]
    y = np.array(df_test["tradeindictor"])  # labels only
    # HLOC is not analysed directly: keep only the feature columns.
    X = df_test.drop(["tradeindictor", "close", "datetime", "high", "low", "open", "volume"], axis=1).values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)  # 70/30
    print("訓練集長度: %s, 測試集長度: %s" % (len(X_train), len(X_test)))

    # Feature selection: keep the best-scoring 70% of the features.
    # SelectKBest(mutual_info_classif, k=...) would keep a fixed number instead.
    print(X_train.shape)
    selectPer = SelectPercentile(mutual_info_classif, percentile=70)
    X_train = selectPer.fit_transform(X_train, y_train)
    print(X_train.shape)
    X_test = selectPer.transform(X_test)

    # Compare the candidate models with a grid search.
    GirdValuate(X_train, y_train)

    # Evaluate the selected model on the held-out set.
    # Fill in the best model and parameters reported by the grid search, e.g.
    # {'hidden_layer_sizes': (100, 30), 'max_iter': 20, 'solver': 'adam', 'verbose': False} : 0.9897016507648039
    clf_selected = MLPClassifier(hidden_layer_sizes=(100, 30), max_iter=20, solver='adam')
    clf_selected.fit(X_train, y_train)
    y_pred = clf_selected.predict(X_test)
    # accuracy
    accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    print ('accuracy:', accuracy)
    # precision / recall: with average="micro" on single-label multi-class
    # data both equal the accuracy (as the printed results show).
    precision = metrics.precision_score(y_true=y_test, y_pred=y_pred, average="micro")
    print ('precision:', precision)
    recall = metrics.recall_score(y_true=y_test, y_pred=y_pred, average="micro")
    print ('recall:', recall)
    # Actual versus predicted classes.
    print (y_test)
    print (y_pred)
    dfresult = pd.DataFrame({'Actual': y_test, 'Predict': y_pred})
    dfresult.to_csv(exportpath + "result" + ".csv", index=True, header=True)

    # Persist the model locally ...
    joblib.dump(clf_selected, 'clf_selected.m')
    # ... and restore it.
    clf_tmp = joblib.load('clf_selected.m')
    return clf_tmp


if __name__ == '__main__':
    main()
執行結果:
訓練集長度: 11673, 測試集長度: 5003
(11673, 8)
(11673, 5)
('accuracy:', '0.7833300019988008')
('precision:', '0.7833300019988008')
('recall:', '0.7833300019988008')
[ 1. 0. 0. ... 0. 0. -1.]
[0. 0. 0. ... 0. 0. 0.]
在vnpy中使用,簡單說下: 在策略init 方法中使用 clf_tmp=joblib.load( 'clf_selected.m' ) 讀取模型,然後在onXminBar方法中,
使用ArrayManager計算那些特徵值,再使用 clf_tmp.predict() 預測分類,如果1開多單,-1空單,0略過。注意模型是用特徵選擇後的欄位訓練的(上面由8個特徵選出5個),所以預測前要對特徵值做同樣的選擇,也就是把訓練時的 selectPer 一併儲存並呼叫 selectPer.transform()。
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/22259926/viewspace-2648828/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- 利用Scikit-learn機器學習庫的特徵分類進行vnpy期貨量化交易機器學習特徵
- 利用機器學習進行惡意程式碼分類機器學習
- 數字貨幣/期貨量化交易系統開發(交易演算法)| 量化交易系統開發原始碼示例演算法原始碼
- VNPY 單品種期貨的網格交易策略的實現
- 經典量化策略——做市商交易(期貨)
- 期貨量化合約交易系統開發多少錢一套?(期貨量化合約交易系統開發流程)
- 利用Hog特徵和SVM分類器進行行人檢測HOG特徵
- 數字貨幣量化交易平臺 數字貨幣量化交易平臺有哪些 雲度數字貨幣量化交易 什麼是量化交易 雲度量化介紹 數字貨幣市場的量化交易工具有哪些? 量化交易需要注意什麼?
- 利用sklearn進行字典&文字的特徵提取特徵
- 量化合約/合約量化/秒合約/永續合約/現貨期權期貨/交易所繫統開發案例及原始碼原始碼
- 量化現貨交易系統開發(功能詳解)| 量化現貨交易系統(原始碼demo示例)原始碼
- [乾貨]如何從不均衡類中進行機器學習機器學習
- 簡單程式碼:將回歸特徵轉換為分類特徵特徵
- 期貨量化交易模型系統開發優勢有哪些(原始碼demo示例)模型原始碼
- 利用聚寬(Joinquant)資料來源為vnpy新增期貨行情資料
- 機器學習二——利用numpy庫對矩陣進行操作機器學習矩陣
- 十行程式碼帶你量化交易入門行程
- 現貨期權期貨/合約量化/量化合約/秒合約/永續合約/交易所繫統開發成熟技術及原始碼原始碼
- 如何使用Python、Transformers和scikit-learn對文字進行分類?PythonORM
- 現貨策略跟單量化交易系統程式設計開發及程式碼示例(量化跟單)程式設計
- 合約現貨量化交易開發系統原始碼|量化交易機器人對沖策略原始碼機器人
- 數字貨幣期貨合約交易系統開發,自動對衝量化交易所開發
- 數字貨幣量化交易系統開發功能詳解丨量化交易開發原始碼模式原始碼模式
- 【機器學習PAI實踐十二】機器學習實現男女聲音識別分類(含語音特徵提取資料和程式碼)機器學習AI特徵
- VNPY,從傳送交易指令到交易所的原始碼分析原始碼
- “進化與適應”:除期貨交易外,高盛可能將涉足加密貨幣交易加密
- 龍哥量化:期貨日內波段策略,利用量化程式自動浮盈加倉,自動止盈止損。
- 機器學習之特徵組合: 多非線性規律進行編碼機器學習特徵
- 【量化交易】頂底分型策略
- AI和機器學習對量化交易領域的影響AI機器學習
- 【機器學習】--xgboost初始之程式碼實現分類機器學習
- python中的scikit-learn庫來實現SVM分類器。Python
- 量化交易系統開發需求丨量化交易原始碼模式原始碼模式
- VNPY 自帶跨時間週期交易策略MultiTimeframeStrategy 分析
- Collections工具類,可以使用collections工具類對程式碼中的list進行分組
- 入門系列之Scikit-learn在Python中構建機器學習分類器Python機器學習
- 機構投資者在華爾街首次進行比特幣期貨實物交易比特幣
- 人工智慧-機器學習-Python-第三方庫-scikit-learn(用於特徵工程)人工智慧機器學習Python特徵工程