# R:梯度提升機(Gradient Boosting Machine, GBM)

# 作者:王哲MGG_AI,發表於 2024-10-14
# Script setup: reset the workspace, set the working directory, attach the
# packages and fix the random seed.
# Remove every object from the global environment.
# NOTE(review): this also wipes anything a caller defined before running
# the script - confirm that is intended.
rm(list = ls())
# Change into the project folder; the relative file names used by the rest
# of the script are resolved against this directory.
# NOTE(review): absolute, machine-specific Windows path.
setwd("C:\\Users\\Administrator\\Desktop\\machine learning\\LSTM")
library(magrittr)  # provides the %>% pipe operator
library(keras)     # Keras interface; NOTE(review): no keras call is visible in this part of the script
library(dplyr)
library(caret)

set.seed(123) # fix the RNG seed so results are reproducible
# Read the two tab-separated input tables. The first column of the OTU
# table supplies its row names; both files have a header row.
otu <- read.table(
  "genus_otu.txt",
  header = TRUE,
  sep = "\t",
  row.names = 1
)
group <- read.table("group.txt", header = TRUE, sep = "\t")

# Transpose the OTU table (samples become rows, genera become columns) and
# join it to the grouping table: `Sample` on the group side is matched
# against the row names of the transposed OTU table.
otu_transposed <- t(otu)
otu_combined <- merge(
  x = group,
  y = otu_transposed,
  by.x = "Sample",
  by.y = "row.names"
)

# Gene and Time are categorical labels, so store both as factors.
otu_combined[["Gene"]] <- as.factor(otu_combined[["Gene"]])
otu_combined[["Time"]] <- as.factor(otu_combined[["Time"]])

# Hand-picked samples that form the test set.
test_samples <- c(
  "B2W10_1", "B2W8_1", "B2W6_1", "B2W4_1",
  "M2W10_1", "M2W8_1", "M2W6_1", "M2W4_1"
)
# Commented-out sample IDs kept from the original script (presumably an
# alternative test set - confirm before re-enabling):
# "B3W10_1", "B3W8_1", "B3W6_1", "B3W4_1", "M3W10_1", "M3W8_1", "M3W6_1", "M3W4_1",

# Rows whose Sample is listed above go to the test set, all others to the
# training set.
test_data <- filter(otu_combined, Sample %in% test_samples)
train_data <- filter(otu_combined, !(Sample %in% test_samples))

# Attach the gbm package.
library(gbm)

# Drop features that show no variation in the training set.
#
# Each column is inspected on its own with vapply(). Calling apply() on the
# data frame would first coerce it to a character matrix (it contains the
# Gene/Time factors), formatting numeric columns to a limited number of
# significant digits, so distinct values could collapse into one string and
# a varying feature could be removed by mistake.
is_constant <- vapply(
  train_data[-1],
  function(col) length(unique(col)) == 1,
  logical(1)
)

# Positions (named by column) of the constant columns, counted within
# train_data without its first column.
no_variation_cols <- which(is_constant)

# Shift by one to get positions in the full table, and also drop column 1
# (the sample name). The same columns are removed from both sets so their
# layouts stay identical; drop = FALSE keeps a data frame even if only one
# column survives.
drop_cols <- c(1, no_variation_cols + 1)
train_data_filtered <- train_data[, -drop_cols, drop = FALSE]
test_data_filtered <- test_data[, -drop_cols, drop = FALSE]

# Train a gradient boosted tree model that predicts genotype (Gene).
# `Gene ~ .` uses every other column of train_data_filtered as a predictor;
# that table still contains Time unless Time was constant in training.
gbm_model_gene <- gbm(
  Gene ~ .,
  data = train_data_filtered,        # sample name and constant columns removed
  distribution = "multinomial",      # multi-class classification
  n.trees = 1000,                    # number of trees
  interaction.depth = 3,             # maximum tree depth
  shrinkage = 0.01,                  # learning rate
  cv.folds = 5                       # 5-fold cross-validation
)

# Predict on the test set, using the table that has the same column layout
# as the training data. All fitted trees are used.
# NOTE(review): the cross-validation error is never consulted; consider
# choosing the number of trees with gbm.perf(method = "cv").
pred_gene <- predict(
  gbm_model_gene,
  newdata = test_data_filtered,
  n.trees = gbm_model_gene$n.trees,
  type = "response"
)
write.table(
  pred_gene,
  file = "pred_gene.txt",
  sep = "\t",
  row.names = TRUE,
  col.names = NA
)

# Turn the per-class scores into a class label: for every test row, take
# the column name with the largest value.
pred_gene_class <- apply(
  pred_gene,
  1,
  function(x) colnames(pred_gene)[which.max(x)]
)
write.table(
  pred_gene_class,
  file = "pred_gene_class.txt",
  sep = "\t",
  row.names = TRUE,
  col.names = NA
)

# Accuracy / confusion matrix. The predictions are given the reference's
# level set explicitly, so a class that was never predicted does not leave
# the two factors with different levels. print() makes the result visible
# even when the script is run through source().
pred_gene_factor <- factor(pred_gene_class, levels = levels(test_data$Gene))
print(confusionMatrix(pred_gene_factor, test_data$Gene))

# Feature importance, saved to a tab-separated file.
importance_gene <- summary(gbm_model_gene)
write.table(
  importance_gene,
  file = "results.txt",
  sep = "\t",
  row.names = TRUE,
  col.names = NA
)

# Train a gradient boosted tree model that predicts the time point (Time).
# It is fitted on the same filtered table as the genotype model, so
# constant features are excluded here as well. `Time ~ .` uses every other
# column of that table as a predictor (this includes Gene unless Gene was
# constant in training).
gbm_model_time <- gbm(
  Time ~ .,
  data = train_data_filtered,        # sample name and constant columns removed
  distribution = "multinomial",      # multi-class classification
  n.trees = 1000,                    # number of trees
  interaction.depth = 3,             # maximum tree depth
  shrinkage = 0.01,                  # learning rate
  cv.folds = 5                       # 5-fold cross-validation
)

# Predict on the test set with all fitted trees, then pick, for every test
# row, the class whose score is largest.
pred_time <- predict(
  gbm_model_time,
  newdata = test_data_filtered,
  n.trees = gbm_model_time$n.trees,
  type = "response"
)
pred_time_class <- apply(
  pred_time,
  1,
  function(x) colnames(pred_time)[which.max(x)]
)

# Confusion matrix. Predictions get the reference's level set explicitly so
# both factors always share the same levels; print() keeps the output
# visible when the script is run through source().
pred_time_factor <- factor(pred_time_class, levels = levels(test_data$Time))
print(confusionMatrix(pred_time_factor, test_data$Time))

# Feature importance: computed once and printed (the original computed and
# displayed the same summary twice).
importance_time <- summary(gbm_model_time)
print(importance_time)

# 相關文章