缺失值處理
將一些0值不合理的列以列均值填充
# Missing-value handling.
# Columns in which a literal 0 is treated as a missing measurement.
features_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Swap those zeros for NaN; this overwrites the listed columns of `data` in place.
zero_as_nan = data[features_with_zero].replace(0, np.nan)
data[features_with_zero] = zero_as_nan
# Fill every NaN with the mean of its own column (means are taken after the
# zeros were turned into NaN, so the zeros do not contribute to them).
column_means = data.mean()
data_filled = data.fillna(column_means)
異常值處理
基於z-score統計進行異常值處理,閾值為3
from scipy import stats

# Outlier removal: a row is kept only if the absolute z-score of every one of
# its columns is strictly below `threshold`.
threshold = 3
z_scores = np.abs(stats.zscore(data_filled))
within_limit = z_scores < threshold
filtered_entries = within_limit.all(axis=1)
data_filled = data_filled.loc[filtered_entries]
# Summary statistics of the filtered data (shown as the cell output in a notebook).
data_filled.describe()
計算各特徵與 Outcome 的相關性,按相關係數絕對值由高到低排序,並完成視覺化
# Compute the correlation matrix.
# Work on a copy: `corr_data = train_data` would only alias the frame, so adding
# the "Outcome" column below would also add the label to `train_data` itself.
corr_data = train_data.copy()
corr_data["Outcome"] = target
corr_matrix = corr_data.corr()
# Absolute correlation of every column with Outcome, strongest first;
# Outcome's own entry is dropped from the ranking.
corr_outcome = corr_matrix['Outcome'].abs().sort_values(ascending=False)
corr_outcome = corr_outcome.drop('Outcome')
# Visualize the ranking as a horizontal bar chart.
plt.rcdefaults()  # reset matplotlib rcParams to their defaults before plotting
plt.figure(figsize=(10, 6))
sns.barplot(x=corr_outcome.values, y=corr_outcome.index, palette='viridis')
plt.title('sort by corr with Outcome')
plt.xlabel('abs of corr')
plt.ylabel('feature')
plt.show()
選擇相關性最高的 'Glucose'、'BMI',以 StandardScaler 做標準化處理
# Standardize the selected features. StandardScaler standardizes each column
# (by default to zero mean and unit variance); it is not min-max normalization.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler_train_data_selected = scaler.fit_transform(train_data_selected)
# Rebuild a DataFrame with the original column names AND the original row index.
# Without `index=` the result gets a fresh 0..n-1 index, which no longer matches
# the labels of `train_data_selected` if rows were removed from it earlier
# (presumably by the outlier filter -- verify), breaking any label-based alignment.
data_scaled = pd.DataFrame(
    scaler_train_data_selected,
    columns=train_data_selected.columns,
    index=train_data_selected.index,
)
訓練邏輯迴歸模型並繪製散點圖
標註:散點圖的繪製函式由GPT完成
# Fraction of the rows used for fitting; the remainder is held out as the test set.
# NOTE(review): 0.2 means the model is trained on only 20% of the rows --
# confirm this was not meant to be a test_size of 0.2.
train_size = 0.2
x_train, x_test, y_train, y_test = train_test_split(
    data_scaled,
    target,
    train_size=train_size,
    random_state=14,
)
# Build the logistic-regression classifier and fit it on the training split
# (`fit` returns the estimator itself, so `model` is the fitted classifier).
model = LogisticRegression(max_iter=1000, solver='saga', C=0.5).fit(x_train, y_train)
以不同的訓練集比例(0.75、0.80、0.85)拆分資料,並比較各自在測試集上的準確率
# Measure how the share of data used for training affects held-out accuracy.
from sklearn.base import clone

# NOTE: despite its name, `test_sizes` holds the fractions passed to
# train_test_split as `train_size`; the name is left unchanged for
# compatibility with any code that references it.
test_sizes = [0.75, 0.80, 0.85]
for size in test_sizes:
    X_train, X_test, Y_train, Y_test = train_test_split(
        data_scaled, target, train_size=size, random_state=42
    )
    # Fit a fresh, unfitted copy of `model` (same hyper-parameters) on this
    # split. Predicting with the already-fitted `model` would score it on rows
    # that may have been in its own training split, and the reported accuracy
    # would not depend on `size` at all.
    size_model = clone(model).fit(X_train, Y_train)
    Y_pred = size_model.predict(X_test)
    acc = accuracy_score(Y_test, Y_pred)
    print(f'Train size: {size}, Accuracy: {acc:.4f}')