資料探勘——SVM(乳腺癌檢測)
文章目錄
import matplotlib
matplotlib.use('Qt4Agg')
# 乳腺癌診斷分類
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
# Breast-cancer diagnosis with an SVM classifier.
#
# Pipeline: load the CSV, print a quick exploration, drop the id column,
# encode the label, plot the class balance and a correlation heat map of the
# "*_mean" features, then train and evaluate an SVC on a hand-picked subset
# of those features.

# Load the dataset; data.csv must be placed in the working directory.
data = pd.read_csv("./data.csv")

# --- Exploration ---------------------------------------------------------
# The frame has many columns, so tell pandas to print all of them.
pd.set_option('display.max_columns', None)
print(data.columns)
print(data.head(5))
print(data.describe())

# Split the feature columns into three groups by position. These slices are
# taken while the leading "id" and "diagnosis" columns are still present,
# so index 2 is the first feature column.
features_mean = list(data.columns[2:12])
features_se = list(data.columns[12:22])
features_worst = list(data.columns[22:32])

# --- Cleaning ------------------------------------------------------------
# The id column carries no predictive information; remove it.
data = data.drop("id", axis=1)
# Encode the label: B (benign) -> 0, M (malignant) -> 1.
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# --- Visualisation -------------------------------------------------------
# Class balance of the diagnosis label. The series is passed as `x=`
# explicitly: recent seaborn versions treat the first positional argument
# as `data`, not `x`.
sns.countplot(x=data['diagnosis'], label="Count")
plt.show()

# Heat map of the pairwise correlations between the features_mean columns.
corr = data[features_mean].corr()
plt.figure(figsize=(14, 14))
# annot=True writes the correlation value into every cell.
sns.heatmap(corr, annot=True)
plt.show()

# --- Feature selection ---------------------------------------------------
# Hand-picked subset of the "*_mean" columns used as model inputs.
features_remain = [
    'radius_mean',
    'texture_mean',
    'smoothness_mean',
    'compactness_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]

# --- Train / test split --------------------------------------------------
# Hold out 30% of the rows as the test set. A fixed random_state makes the
# split, and therefore the reported accuracy, reproducible between runs.
train, test = train_test_split(data, test_size=0.3, random_state=42)

# Extract the selected feature columns and the label for each partition.
train_X = train[features_remain]
train_y = train['diagnosis']
test_X = test[features_remain]
test_y = test['diagnosis']

# Z-score standardisation. The scaler is fitted on the training set only and
# then applied to the test set, so no test statistics leak into training.
ss = StandardScaler()
train_X = ss.fit_transform(train_X)
test_X = ss.transform(test_X)

# --- Model ---------------------------------------------------------------
# Build an SVM classifier with scikit-learn's default settings.
model = svm.SVC()
# Fit on the training set.
model.fit(train_X, train_y)
# Predict on the held-out test set.
prediction = model.predict(test_X)
# accuracy_score expects (y_true, y_pred) in that order.
print('準確率: ', metrics.accuracy_score(test_y, prediction))
輸出(以下依序為上述程式印出的欄位名稱、前 5 筆資料、描述統計與準確率):
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst'],
dtype='object')
id diagnosis radius_mean texture_mean perimeter_mean area_mean \
0 842302 M 17.99 10.38 122.80 1001.0
1 842517 M 20.57 17.77 132.90 1326.0
2 84300903 M 19.69 21.25 130.00 1203.0
3 84348301 M 11.42 20.38 77.58 386.1
4 84358402 M 20.29 14.34 135.10 1297.0
smoothness_mean compactness_mean concavity_mean concave points_mean \
0 0.11840 0.27760 0.3001 0.14710
1 0.08474 0.07864 0.0869 0.07017
2 0.10960 0.15990 0.1974 0.12790
3 0.14250 0.28390 0.2414 0.10520
4 0.10030 0.13280 0.1980 0.10430
symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se \
0 0.2419 0.07871 1.0950 0.9053 8.589
1 0.1812 0.05667 0.5435 0.7339 3.398
2 0.2069 0.05999 0.7456 0.7869 4.585
3 0.2597 0.09744 0.4956 1.1560 3.445
4 0.1809 0.05883 0.7572 0.7813 5.438
area_se smoothness_se compactness_se concavity_se concave points_se \
0 153.40 0.006399 0.04904 0.05373 0.01587
1 74.08 0.005225 0.01308 0.01860 0.01340
2 94.03 0.006150 0.04006 0.03832 0.02058
3 27.23 0.009110 0.07458 0.05661 0.01867
4 94.44 0.011490 0.02461 0.05688 0.01885
symmetry_se fractal_dimension_se radius_worst texture_worst \
0 0.03003 0.006193 25.38 17.33
1 0.01389 0.003532 24.99 23.41
2 0.02250 0.004571 23.57 25.53
3 0.05963 0.009208 14.91 26.50
4 0.01756 0.005115 22.54 16.67
perimeter_worst area_worst smoothness_worst compactness_worst \
0 184.60 2019.0 0.1622 0.6656
1 158.80 1956.0 0.1238 0.1866
2 152.50 1709.0 0.1444 0.4245
3 98.87 567.7 0.2098 0.8663
4 152.20 1575.0 0.1374 0.2050
concavity_worst concave points_worst symmetry_worst \
0 0.7119 0.2654 0.4601
1 0.2416 0.1860 0.2750
2 0.4504 0.2430 0.3613
3 0.6869 0.2575 0.6638
4 0.4000 0.1625 0.2364
fractal_dimension_worst
0 0.11890
1 0.08902
2 0.08758
3 0.17300
4 0.07678
id radius_mean texture_mean perimeter_mean area_mean \
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000
smoothness_mean compactness_mean concavity_mean concave points_mean \
count 569.000000 569.000000 569.000000 569.000000
mean 0.096360 0.104341 0.088799 0.048919
std 0.014064 0.052813 0.079720 0.038803
min 0.052630 0.019380 0.000000 0.000000
25% 0.086370 0.064920 0.029560 0.020310
50% 0.095870 0.092630 0.061540 0.033500
75% 0.105300 0.130400 0.130700 0.074000
max 0.163400 0.345400 0.426800 0.201200
symmetry_mean fractal_dimension_mean radius_se texture_se \
count 569.000000 569.000000 569.000000 569.000000
mean 0.181162 0.062798 0.405172 1.216853
std 0.027414 0.007060 0.277313 0.551648
min 0.106000 0.049960 0.111500 0.360200
25% 0.161900 0.057700 0.232400 0.833900
50% 0.179200 0.061540 0.324200 1.108000
75% 0.195700 0.066120 0.478900 1.474000
max 0.304000 0.097440 2.873000 4.885000
perimeter_se area_se smoothness_se compactness_se concavity_se \
count 569.000000 569.000000 569.000000 569.000000 569.000000
mean 2.866059 40.337079 0.007041 0.025478 0.031894
std 2.021855 45.491006 0.003003 0.017908 0.030186
min 0.757000 6.802000 0.001713 0.002252 0.000000
25% 1.606000 17.850000 0.005169 0.013080 0.015090
50% 2.287000 24.530000 0.006380 0.020450 0.025890
75% 3.357000 45.190000 0.008146 0.032450 0.042050
max 21.980000 542.200000 0.031130 0.135400 0.396000
concave points_se symmetry_se fractal_dimension_se radius_worst \
count 569.000000 569.000000 569.000000 569.000000
mean 0.011796 0.020542 0.003795 16.269190
std 0.006170 0.008266 0.002646 4.833242
min 0.000000 0.007882 0.000895 7.930000
25% 0.007638 0.015160 0.002248 13.010000
50% 0.010930 0.018730 0.003187 14.970000
75% 0.014710 0.023480 0.004558 18.790000
max 0.052790 0.078950 0.029840 36.040000
texture_worst perimeter_worst area_worst smoothness_worst \
count 569.000000 569.000000 569.000000 569.000000
mean 25.677223 107.261213 880.583128 0.132369
std 6.146258 33.602542 569.356993 0.022832
min 12.020000 50.410000 185.200000 0.071170
25% 21.080000 84.110000 515.300000 0.116600
50% 25.410000 97.660000 686.500000 0.131300
75% 29.720000 125.400000 1084.000000 0.146000
max 49.540000 251.200000 4254.000000 0.222600
compactness_worst concavity_worst concave points_worst \
count 569.000000 569.000000 569.000000
mean 0.254265 0.272188 0.114606
std 0.157336 0.208624 0.065732
min 0.027290 0.000000 0.000000
25% 0.147200 0.114500 0.064930
50% 0.211900 0.226700 0.099930
75% 0.339100 0.382900 0.161400
max 1.058000 1.252000 0.291000
symmetry_worst fractal_dimension_worst
count 569.000000 569.000000
mean 0.290076 0.083946
std 0.061867 0.018061
min 0.156500 0.055040
25% 0.250400 0.071460
50% 0.282200 0.080040
75% 0.317900 0.092080
max 0.663800 0.207500
準確率: 0.9415204678362573
支援向量機思維導圖:
相關文章
- 圖資料探勘:社群檢測演算法(一)演算法
- 資料探勘-預測模型彙總模型
- 【python資料探勘課程】二十七.基於SVM分類器的紅酒資料分析Python
- logminer進行資料探勘分析測試
- 資料探勘與預測分析(第2版)
- 資料探勘之產品預測任務
- 基於WOA-SVM的乳腺癌資料分類識別演算法matlab模擬,對比BP神經網路和SVM演算法Matlab神經網路
- 基於PSO-SVM的乳腺癌資料分類識別演算法matlab模擬,對比BP神經網路和SVM演算法Matlab神經網路
- 資料探勘( TO DO LIST)
- 資料探勘技術
- 資料探勘與生活
- 運用深度學習技術檢測轉移性乳腺癌深度學習
- 《資料探勘導論》實驗課——實驗四、資料探勘之KNN,Naive BayesKNNAI
- 資料探勘和資料提取能做什麼?
- 常用資料探勘演算法演算法
- 資料探勘-層次聚類聚類
- js檢測資料型別JS資料型別
- javascript 資料型別檢測JavaScript資料型別
- 淺談大資料、資料分析、資料探勘的區別!大資料
- 理解Transformer [資料探勘深度學習]ORM深度學習
- 資料探勘的步驟有哪些?
- 《資料探勘導論》讀後感
- 大資料探勘有哪些技術大資料
- 資料探勘技術功能有哪些
- 資料探勘的過程有哪些
- 資料探勘的辦法有哪些
- 資料探勘之 層次聚類聚類
- 資料探勘---BP神經網路神經網路
- 目標檢測資料集分析
- 資料分析與資料探勘 - 04科學計算
- 萌新向Python資料分析及資料探勘 前言Python
- Yahoo前任資料官:資料探勘與分析技巧(下)IF
- 資料探勘和資料提取該怎麼區分?
- 谷歌乳腺癌AI檢測新突破,誤診率遠低於人類醫生谷歌AI
- 大資料應用——資料探勘之推薦系統大資料
- 《資料分析與資料探勘》--天津大學公開課
- 大資料時代,如何做資料探勘與分析!大資料
- 資料探勘十大演算法演算法