import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.font_manager import FontProperties
from sklearn import datasets
# Font handle pointing at the SimHei font file (Windows-specific path).
font = FontProperties(fname=r"C:\Windows\Fonts\simhei.ttf", size=14)

# Load the iris dataset bundled with scikit-learn and dump the raw container.
iris = datasets.load_iris()
print(iris)

# Feature matrix: report how many rows it has and preview the first five.
x = iris.data
print(f'x個數:{len(x)}')
print(f'x:{x[0:5]}')

# Target vector: report its length and show every label.
y = iris.target
print(f'y個數:{len(y)}')
print(f'y:{y}')
# Visualization: scatter plot of the first two feature columns, one color
# per row range.
x_ = x[:, [0, 1]]

# Pick a font that contains the glyphs used in the labels below.
plt.rcParams['font.sans-serif'] = ['simHei']

# Each entry is (row range, marker color, legend label).
# NOTE(review): this assumes the rows are ordered by class in three runs of
# 50 samples — confirm against the dataset before reusing with other data.
for rows, color, label in (
    (slice(0, 50), 'r', '山鳶尾'),
    (slice(50, 100), 'g', '雜色鳶尾'),
    (slice(100, 150), 'b', '維吉尼亞鳶尾'),
):
    plt.scatter(x_[rows, 0], x_[rows, 1], color=color, label=label, s=10)

plt.xlabel('萼片長度')
plt.ylabel('萼片寬度')
plt.title('鳶尾花資料')
plt.legend()
plt.show()
# Data split: hold out one third of the two-feature samples for testing.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_, y, test_size=1/3)

# Report both set sizes in a single message (the original repeated the
# test-set label because the first format string ended with a stray label).
print('訓練集個數:{},測試集個數:{}'.format(len(y_train), len(y_test)))
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# Linear-kernel SVM; probability=True is required for the predict_proba
# call made later in the script.
clf = SVC(kernel='linear', probability=True)

# Train on the training split.
clf.fit(x_train, y_train)

# Predict class labels for the test split.
y_prd = clf.predict(x_test)
print(y_prd)
# Element-wise difference: a zero marks a correctly classified sample.
print(y_prd - y_test)
# Inspect the fitted estimator. The original bare expressions only display a
# value in an interactive notebook; in a script their results were silently
# discarded, so each one is printed explicitly here.

# Hyper-parameters of the estimator.
print(clf.get_params())
print(clf.C)

# Per-class probabilities for the first five test samples.
print(clf.predict_proba(x_test)[0:5, :])

# Score of the classifier on the held-out test split.
print(clf.score(x_test, y_test))
# Model evaluation: 10-fold cross-validation on the two-feature data.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, x_, y, cv=10)
# Report the mean score with a +/- band of two standard deviations.
print('準確率:{:.4f}(+/-{:.4f})'.format(scores.mean(), scores.std() * 2))
# Model tuning: exhaustive grid search over SVC hyper-parameters.
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

svc = SVC()
# Two sub-grids: the linear kernel only varies C, the RBF kernel also
# varies gamma.
param_grid = [
    {'C': [0.1, 1, 10, 20], 'kernel': ['linear']},
    {'C': [0.1, 1, 10, 20], 'kernel': ['rbf'], 'gamma': [0.1, 1, 10, 20]},
]
scoring = 'accuracy'

# Rebinds clf: from here on it refers to the grid-search wrapper, not the
# linear SVC trained earlier in the script.
clf = GridSearchCV(estimator=svc, param_grid=param_grid, scoring=scoring, cv=10)
# NOTE(review): this fits on x (every feature column) whereas the earlier
# sections used the two-column x_ — confirm that the switch is intended.
clf = clf.fit(x, y)
print(clf.predict(x))

# Best parameter combination and its cross-validated score.
print(clf.best_params_)
print(clf.best_score_)
# Model persistence via an in-memory pickle round-trip.
import pickle

# Serialize the fitted search object to bytes and preview the first 100.
pkl_str = pickle.dumps(clf)
print(pkl_str[:100])

# Deserialize. NOTE: pickle.loads can execute arbitrary code, so it must only
# be fed trusted bytes; here they were produced a few lines above in this
# same process.
clf2 = pickle.loads(pkl_str)
print(clf2.predict(x))