《Spark機器學習》筆記——Spark構建聚類模型

土豆拍死馬鈴薯發表於2018-01-16
import breeze.plot.{Figure, hist, plot}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Notes for "Machine Learning with Spark": building a clustering model.
 *
 * The program
 *  1. loads the MovieLens 100k files `u.item`, `u.genre` and `u.data`,
 *  2. trains an ALS recommendation model and uses its latent factors as feature vectors,
 *  3. clusters the movie factors with K-means and prints the movies closest to each centre,
 *  4. evaluates the clustering with WCSS and sweeps K on a train/test split for both
 *     movie and user factors, plotting WCSS against K.
 *
 * The data directory may be passed as the first command-line argument; when it is omitted
 * the original hard-coded location is used.
 */
object 聚類 {

  /** Directory used when no command-line argument is supplied. */
  private val DefaultDataDir = "file:///home/chenjie/ml-100k"

  /** Candidate numbers of clusters evaluated during cross-validation. */
  private val CandidateKs: Seq[Int] = Seq(2, 3, 4, 5, 10, 20)

  /**
   * Program entry point.
   *
   * @param args optional; `args(0)` is the directory that contains the MovieLens 100k files
   */
  def main(args: Array[String]): Unit = {
    val dataDir = args.headOption.getOrElse(DefaultDataDir)

    // Connect to the Spark master (local mode).
    val conf = new SparkConf().setAppName("Spark機器學習:聚類").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      run(sc, dataDir)
    } finally {
      // Always release the context, even when one of the jobs fails.
      sc.stop()
    }
  }

  /**
   * Splits `vectors` 60/40 with a fixed seed, trains one K-means model per candidate K on
   * the training part and returns the WCSS of each model on the held-out part.
   *
   * @param vectors       feature vectors to cluster
   * @param numIterations maximum number of K-means iterations
   * @param numRuns       value forwarded as the `runs` argument of `KMeans.train`
   * @return pairs of (K, WCSS on the test split), in the order of [[CandidateKs]]
   */
  private def crossValidateK(
      vectors: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector],
      numIterations: Int,
      numRuns: Int): Seq[(Int, Double)] = {
    val Array(train, test) = vectors.randomSplit(Array(0.6, 0.4), 123)
    CandidateKs.map { k =>
      // KMeans.train takes (data, k, maxIterations, runs) -- K comes before the iteration count.
      (k, KMeans.train(train, k, numIterations, numRuns).computeCost(test))
    }
  }

  /**
   * Draws WCSS against K into one panel of a two-row, one-column figure.
   *
   * @param figure figure that owns the panel
   * @param index  zero-based panel index (0 = top, 1 = bottom)
   * @param title  panel title
   * @param costs  pairs of (K, WCSS)
   */
  private def plotCosts(figure: Figure, index: Int, title: String, costs: Seq[(Int, Double)]): Unit = {
    val panel = figure.subplot(2, 1, index)
    panel.title = title
    // Plot the costs as Doubles so that the fractional part of WCSS is not truncated.
    panel += plot(costs.map { case (k, _) => k.toDouble }, costs.map { case (_, cost) => cost })
    panel.xlabel = "聚類中心數目K"
    panel.ylabel = "WCSS"
  }

  /**
   * Runs the whole clustering walk-through on an already created context.
   *
   * @param sc      active Spark context
   * @param dataDir directory that contains `u.item`, `u.genre` and `u.data`
   */
  private def run(sc: SparkContext, dataDir: String): Unit = {
    val movies = sc.textFile(s"$dataDir/u.item")
    println(movies.first())
    // Sample output:
    // 1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0

    val genres = sc.textFile(s"$dataDir/u.genre")
    genres.foreach(println)
    // Sample output (name|index): unknown|0, Action|1, Adventure|2, Animation|3, Children's|4,
    // Comedy|5, Crime|6, Documentary|7, Drama|8, Fantasy|9, Film-Noir|10, Horror|11, Musical|12,
    // Mystery|13, Romance|14, Sci-Fi|15, Thriller|16, War|17, Western|18

    // Map from genre index (as a string) to genre name.
    val genreMap = genres.filter(line => !line.isEmpty)
      .map(_.split("\\|"))
      .map(fields => (fields(1), fields(0)))
      .collectAsMap()
    println(genreMap)
    // Sample output: Map(2 -> Adventure, 5 -> Comedy, 12 -> Musical, ..., 13 -> Mystery)

    // (movieId, (title, names of the genres whose flag is "1")).
    val titlesAndGenres = movies.map(_.split("\\|")).map { fields =>
      // Columns from index 5 onwards are the 0/1 genre flags; the position of a flag is the
      // genre index used as key in genreMap.
      val genreFlags = fields.toSeq.slice(5, fields.size)
      val genresAssigned = genreFlags.zipWithIndex.collect {
        case ("1", idx) => genreMap(idx.toString)
      }
      (fields(0).toInt, (fields(1), genresAssigned))
    }
    println(titlesAndGenres.first())
    // Sample output: (1,(Toy Story (1995),ArrayBuffer(Animation, Children's, Comedy)))

    val rawData = sc.textFile(s"$dataDir/u.data")
    val rawRatings = rawData.map(_.split("\t").take(3))
    val ratings = rawRatings.map { case Array(user, movie, rating) =>
      Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    ratings.cache()
    val alsModel = ALS.train(ratings, 50, 10, 0.1)

    // ALS returns two key/value RDDs: userFeatures and productFeatures.
    // The key is the user ID or the movie ID, the value is the latent factor array.
    // Convert the factors into MLlib vectors so they can be fed to the clustering model.
    val movieFactors = alsModel.productFeatures.map { case (id, factor) =>
      (id, Vectors.dense(factor))
    }
    val movieVectors = movieFactors.map(_._2)
    val userFactors = alsModel.userFeatures.map { case (id, factor) =>
      (id, Vectors.dense(factor))
    }
    val userVectors = userFactors.map(_._2)

    val movieMatrix = new RowMatrix(movieVectors)
    val movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()
    val userMatrix = new RowMatrix(userVectors)
    val userMatrixSummary = userMatrix.computeColumnSummaryStatistics()
    println("Movie factors mean :" + movieMatrixSummary.mean)
    println("Movie factors variance :" + movieMatrixSummary.variance)
    println("User factors mean :" + userMatrixSummary.mean)
    println("User factors variance :" + userMatrixSummary.variance)
    // Inspect the distribution of the factor vectors to decide whether normalisation is needed.
    // No obvious outliers were observed, so the vectors are clustered without normalisation.

    val numCluster = 5 // K
    val numIterations = 10 // maximum number of iterations
    val numRuns = 3 // number of training runs

    val movieClusterModel = KMeans.train(movieVectors, numCluster, numIterations, numRuns)
    println(movieClusterModel)

    // Same setup but with up to 100 iterations so the model can converge.
    // (The value 100 belongs in the maxIterations slot, not in the runs slot.)
    val movieClusterModelConverged = KMeans.train(movieVectors, numCluster, 100, numRuns)
    println(movieClusterModelConverged)

    // 7.4 Making predictions with the clustering model
    val movie1 = movieVectors.first()
    val movieCluster = movieClusterModel.predict(movie1)
    println("預測一個:" + movieCluster)

    val predictions = movieClusterModel.predict(movieVectors)
    println("預測一堆:" + predictions.take(5).mkString(","))

    import breeze.linalg._
    import breeze.numerics.pow
    // Squared Euclidean distance between two vectors.
    def computeDistance(v1: DenseVector[Double], v2: DenseVector[Double]) = pow(v1 - v2, 2).sum

    // Interpret the cluster predictions using the MovieLens titles and genres.
    val titlesWithFactors = titlesAndGenres.join(movieFactors)
    val movieAssigned = titlesWithFactors.map { case (id, ((title, movieGenres), vector)) =>
      val pred = movieClusterModel.predict(vector)
      val clusterCentre = movieClusterModel.clusterCenters(pred)
      val dist = computeDistance(DenseVector(clusterCentre.toArray), DenseVector(vector.toArray))
      (id, title, movieGenres.mkString(" "), pred, dist)
    }
    val clusterAssignments = movieAssigned
      .groupBy { case (_, _, _, cluster, _) => cluster }
      .collectAsMap()

    // For every cluster print the 20 movies closest to its centre.
    for ((k, members) <- clusterAssignments.toSeq.sortBy(_._1)) {
      println(s"Cluster $k")
      val closestFirst = members.toSeq.sortBy(_._5)
      println(closestFirst.take(20).map { case (_, title, movieGenres, _, d) =>
        (title, movieGenres, d)
      }.mkString("\n"))
      println("========\n")
    }

    // 7.5 Evaluating the performance of clustering models
    // 7.5.1 Internal evaluation metrics:
    //       WCSS, Davies-Bouldin index, Dunn index, silhouette coefficient
    // 7.5.2 External evaluation metrics:
    //       Rand measure, F-measure, Jaccard index, ...
    // 7.5.3 Using the function provided by MLlib
    val movieCost = movieClusterModel.computeCost(movieVectors)
    println("WCSS for movies : " + movieCost)

    // 7.6 Tuning the parameters of the clustering model

    // Choose K through cross-validation.
    val costsMovies = crossValidateK(movieVectors, numIterations, numRuns)
    println("Movie clustering cross-validation:")
    costsMovies.foreach { case (k, cost) => println(f"WCSS for K=$k id $cost%2.4f") }

    val f = Figure()
    // Panel 0 of a 2-row, 1-column figure.
    plotCosts(f, 0, "Movies'WCSS隨聚類中心數目K變化圖", costsMovies)

    // The user sweep must split the user vectors, not reuse the movie split.
    val costsUsers = crossValidateK(userVectors, numIterations, numRuns)
    println("Users clustering cross-validation:")
    costsUsers.foreach { case (k, cost) => println(f"WCSS for K=$k id $cost%2.4f") }

    // Panel 1 of a 2-row, 1-column figure.
    plotCosts(f, 1, "Users'WCSS隨聚類中心數目K變化圖", costsUsers)
  }
}



相關文章