接著案例一,我們再使用另一種方法實作一個案例
直接上程式碼:
#!/usr/bin/Rscript
library(plyr)
library(reshape2)

# 1. Build the naive Bayes classifier from a training set.

## 1.1 Class prior probabilities.
## Computes P{c_i}: the relative frequency of each class in the training set.
## Input : trainData    - training set, as a data frame
##         strClassName - name of the column holding the class label
## Output: data frame <class name | prob>, one row per class
class_prob <- function(trainData, strClassName) {
  # Total number of training samples.
  n.samples <- nrow(trainData)
  # Count occurrences of each class (ddply adds an "nrow" count column).
  class.count <- ddply(trainData, strClassName, "nrow")
  # Convert raw counts into relative frequencies.
  class.count <- ddply(class.count, strClassName, mutate,
                       prob = nrow / n.samples)
  # Drop the count column, keeping <class name | prob>.
  class.count[, -2]
}

## 1.2 Per-class conditional probabilities of each feature value.
## Computes P{f_i | c_i} for every (class, feature, value) triple.
## Input : trainData    - training set, as a data frame
##         strClassName - class-label column; every other column is a feature
## Output: data frame <class.name | feature.name | feature.value | prob>
feature_class_prob <- function(trainData, strClassName) {
  # Wide -> long: one row per (sample, feature, value).
  long.data <- melt(trainData, id = c(strClassName))
  # Frequency of each (class, feature, value) combination.
  freq <- ddply(long.data, c(strClassName, "variable", "value"), "nrow")
  # Within each (class, feature) group, normalise counts to probabilities.
  freq <- ddply(freq, c(strClassName, "variable"), mutate,
                sum = sum(nrow), prob = nrow / sum)
  # Standardise the column names.
  colnames(freq) <- c("class.name", "feature.name", "feature.value",
                      "feature.nrow", "feature.sum", "prob")
  # Keep <class.name | feature.name | feature.value | prob>.
  freq[, c(1, 2, 3, 6)]
}

## The naive Bayes classifier is now fully defined.
## 2. Predict with the naive Bayes classifier built above.
## Computes the (unnormalised) posterior P{f_i|c_i} product for one sample.
## Input : oneObs - data frame <feature name | feature value>, the sample
##         pc     - class priors P{c_i}: <class name | prob>
##         pfc    - conditionals P{f_i|c_i}:
##                  <class name | feature name | feature value | prob>
## Output: data frame <class.name | pre_prob>, one row per class
pre_class <- function(oneObs, pc, pfc) {
  # Normalise the column names so the joins below line up.
  colnames(oneObs) <- c("feature.name", "feature.value")
  colnames(pc) <- c("class.name", "prob")
  colnames(pfc) <- c("class.name", "feature.name", "feature.value", "prob")
  # Conditional probabilities of the observed feature values, per class.
  matched <- join(oneObs, pfc,
                  by = c("feature.name", "feature.value"), type = "inner")
  # Multiply the conditionals within each class (prod = running product).
  likelihood <- ddply(matched, .(class.name), summarize,
                      prob_fea = prod(prob))
  # Attach the class prior to each class.
  with.prior <- join(likelihood, pc, by = "class.name", type = "inner")
  # Posterior ~ likelihood * prior; keep <class.name | pre_prob>.
  ddply(with.prior, .(class.name), mutate,
        pre_prob = prob_fea * prob)[, c(1, 4)]
}

## 3. Smoke test using the apple data from the earlier example.
# Training set.
train.apple <- data.frame(
  size = c("大", "小", "大", "大", "小", "小"),
  weight = c("輕", "重", "輕", "輕", "重", "輕"),
  color = c("紅", "紅", "紅", "綠", "紅", "綠"),
  taste = c("good", "good", "bad", "bad", "bad", "good")
)
# Sample to classify.
oneObs <- data.frame(
  feature.name = c("size", "weight", "color"),
  feature.value = c("大", "重", "紅")
)
# Run the prediction.
pc <- class_prob(train.apple, "taste")
pfc <- feature_class_prob(train.apple, "taste")
pre_class(oneObs, pc, pfc)
預測結果為:
class.name pre_prob
1 bad 0.07407407
2 good 0.03703704
可見該蘋果的口味為:bad
*********************************************這裡是分割線****************************************************
我們使用這個方法再預測一下案例一中的資料集。
# Data set for case one.
# BUG FIX: the original passed matrix() arguments (byrow, nrow, ncol,
# dimnames) directly to data.frame(), which does not accept them; the
# vector must first be shaped with matrix() and then converted into a
# data frame.
data <- as.data.frame(matrix(
  c("sunny",    "hot",  "high",   "weak",   "no",
    "sunny",    "hot",  "high",   "strong", "no",
    "overcast", "hot",  "high",   "weak",   "yes",
    "rain",     "mild", "high",   "weak",   "yes",
    "rain",     "cool", "normal", "weak",   "yes",
    "rain",     "cool", "normal", "strong", "no",
    "overcast", "cool", "normal", "strong", "yes",
    "sunny",    "mild", "high",   "weak",   "no",
    "sunny",    "cool", "normal", "weak",   "yes",
    "rain",     "mild", "normal", "weak",   "yes",
    "sunny",    "mild", "normal", "strong", "yes",
    "overcast", "mild", "high",   "strong", "yes",
    "overcast", "hot",  "normal", "weak",   "yes",
    "rain",     "mild", "high",   "strong", "no"),
  byrow = TRUE, nrow = 14, ncol = 5,
  dimnames = list(day = c(),
                  condition = c("outlook", "temperature", "humidity",
                                "wind", "playtennis"))
))
# Sample to classify.
ddata <- data.frame(
  feature.name = c("outlook", "temperature", "humidity", "wind"),
  feature.value = c("overcast", "mild", "normal", "weak")
)
# Run the prediction.
pc <- class_prob(data, "playtennis")
pfc <- feature_class_prob(data, "playtennis")
pre_class(ddata, pc, pfc)
預測結果為:
class.name pre_prob
1 no 0.02666667
2 yes 0.13168724
預測結果為:yes,可見與案例一的結果一樣。