VectorAssembler: assemble columns into a feature vector
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.VectorAssembler

val colArray = Array("age", "yearsmarried", "religiousness", "education", "occupation", "rating")

// Assemble the listed columns into a single feature vector column
val assembler = new VectorAssembler().setInputCols(colArray).setOutputCol("features")

val vecDF: DataFrame = assembler.transform(data)
vecDF: org.apache.spark.sql.DataFrame = [affairs: double, gender: string ... 8 more fields]

vecDF.select("features", colArray: _*).show(10, truncate = false)
+----------------------------+----+------------+-------------+---------+----------+------+
|features                    |age |yearsmarried|religiousness|education|occupation|rating|
+----------------------------+----+------------+-------------+---------+----------+------+
|[37.0,10.0,3.0,18.0,7.0,4.0]|37.0|10.0        |3.0          |18.0     |7.0       |4.0   |
|[27.0,4.0,4.0,14.0,6.0,4.0] |27.0|4.0         |4.0          |14.0     |6.0       |4.0   |
|[32.0,15.0,1.0,12.0,1.0,4.0]|32.0|15.0        |1.0          |12.0     |1.0       |4.0   |
|[57.0,15.0,5.0,18.0,6.0,5.0]|57.0|15.0        |5.0          |18.0     |6.0       |5.0   |
|[22.0,0.75,2.0,17.0,6.0,3.0]|22.0|0.75        |2.0          |17.0     |6.0       |3.0   |
|[32.0,1.5,2.0,17.0,5.0,5.0] |32.0|1.5         |2.0          |17.0     |5.0       |5.0   |
|[22.0,0.75,2.0,12.0,1.0,3.0]|22.0|0.75        |2.0          |12.0     |1.0       |3.0   |
|[57.0,15.0,2.0,14.0,4.0,4.0]|57.0|15.0        |2.0          |14.0     |4.0       |4.0   |
|[32.0,15.0,4.0,16.0,1.0,2.0]|32.0|15.0        |4.0          |16.0     |1.0       |2.0   |
|[22.0,1.5,4.0,14.0,4.0,5.0] |22.0|1.5         |4.0          |14.0     |4.0       |5.0   |
+----------------------------+----+------------+-------------+---------+----------+------+
only showing top 10 rows
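The transcript above assumes a DataFrame named data that already holds the affairs dataset with numeric columns. A minimal sketch of how such a DataFrame might be created in a standalone program; the SparkSession settings, file path and schema-inference options are assumptions, not part of the original session:

import org.apache.spark.sql.{DataFrame, SparkSession}

val spark = SparkSession.builder()
  .appName("VectorAssemblerExample")
  .master("local[*]")                 // assumption: run locally for this sketch
  .getOrCreate()

// Assumption: the affairs data sits in a CSV file with a header row;
// inferSchema turns the numeric columns into doubles so VectorAssembler can consume them.
val data: DataFrame = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("data/affairs.csv")            // hypothetical path

data.printSchema()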
VectorIndexer: automatically identify categorical features and index them
import org.apache.spark.ml.feature.VectorIndexer

val colArray = Array("age", "yearsmarried", "religiousness", "education", "occupation", "rating")

// Automatically identify categorical features and index them.
// Features with more than 7 distinct values are treated as continuous.
val featureIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(7)
  .fit(vecDF)

val categoricalFeatures: Set[Int] = featureIndexer.categoryMaps.keys.toSet
categoricalFeatures: Set[Int] = Set(2, 3, 4, 5)

println(s"Chose ${categoricalFeatures.size} categorical features: " + categoricalFeatures.mkString(", "))
Chose 4 categorical features: 2, 3, 4, 5

// With maxCategories = 7, 4 of the 6 columns were identified as categorical features.
// Their vector indices are (2, 3, 4, 5), which correspond to elements (2, 3, 4, 5) of colArray,
// i.e. "religiousness", "education", "occupation", "rating".
// Why these 4 columns? See the "count distinct values per column" section of my blog post
// at http://www.cnblogs.com/wwxbi/p/6125363.html: each of ("religiousness", "education",
// "occupation", "rating") has at most 7 distinct values.

// Create new column "indexedFeatures" with categorical values transformed to indices
val indexedData = featureIndexer.transform(vecDF)
indexedData: org.apache.spark.sql.DataFrame = [affairs: double, gender: string ... 9 more fields]

val resColArray = Array("indexedFeatures", "features", "age", "yearsmarried", "religiousness", "education", "occupation", "rating")
resColArray: Array[String] = Array(indexedFeatures, features, age, yearsmarried, religiousness, education, occupation, rating)

indexedData.selectExpr(resColArray: _*).show(10, truncate = false)
+---------------------------+----------------------------+----+------------+-------------+---------+----------+------+
|indexedFeatures            |features                    |age |yearsmarried|religiousness|education|occupation|rating|
+---------------------------+----------------------------+----+------------+-------------+---------+----------+------+
|[37.0,10.0,2.0,5.0,6.0,3.0]|[37.0,10.0,3.0,18.0,7.0,4.0]|37.0|10.0        |3.0          |18.0     |7.0       |4.0   |
|[27.0,4.0,3.0,2.0,5.0,3.0] |[27.0,4.0,4.0,14.0,6.0,4.0] |27.0|4.0         |4.0          |14.0     |6.0       |4.0   |
|[32.0,15.0,0.0,1.0,0.0,3.0]|[32.0,15.0,1.0,12.0,1.0,4.0]|32.0|15.0        |1.0          |12.0     |1.0       |4.0   |
|[57.0,15.0,4.0,5.0,5.0,4.0]|[57.0,15.0,5.0,18.0,6.0,5.0]|57.0|15.0        |5.0          |18.0     |6.0       |5.0   |
|[22.0,0.75,1.0,4.0,5.0,2.0]|[22.0,0.75,2.0,17.0,6.0,3.0]|22.0|0.75        |2.0          |17.0     |6.0       |3.0   |
|[32.0,1.5,1.0,4.0,4.0,4.0] |[32.0,1.5,2.0,17.0,5.0,5.0] |32.0|1.5         |2.0          |17.0     |5.0       |5.0   |
|[22.0,0.75,1.0,1.0,0.0,2.0]|[22.0,0.75,2.0,12.0,1.0,3.0]|22.0|0.75        |2.0          |12.0     |1.0       |3.0   |
|[57.0,15.0,1.0,2.0,3.0,3.0]|[57.0,15.0,2.0,14.0,4.0,4.0]|57.0|15.0        |2.0          |14.0     |4.0       |4.0   |
|[32.0,15.0,3.0,3.0,0.0,1.0]|[32.0,15.0,4.0,16.0,1.0,2.0]|32.0|15.0        |4.0          |16.0     |1.0       |2.0   |
|[22.0,1.5,3.0,2.0,3.0,4.0] |[22.0,1.5,4.0,14.0,4.0,5.0] |22.0|1.5         |4.0          |14.0     |4.0       |5.0   |
+---------------------------+----------------------------+----+------------+-------------+---------+----------+------+
only showing top 10 rows

import org.apache.spark.ml.feature.VectorSlicer

val slicer = new VectorSlicer().setInputCol("indexedFeatures").setOutputCol("slicerFeatures")

// Index 3 corresponds to the column "education" (its position before indexing)
slicer.setIndices(Array(3))

val output = slicer.transform(indexedData)
output.select("indexedFeatures", "slicerFeatures", "education").limit(10).orderBy($"education").show(10, truncate = false)
+---------------------------+--------------+---------+
|indexedFeatures            |slicerFeatures|education|
+---------------------------+--------------+---------+
|[32.0,15.0,0.0,1.0,0.0,3.0]|[1.0]         |12.0     |
|[22.0,0.75,1.0,1.0,0.0,2.0]|[1.0]         |12.0     |
|[27.0,4.0,3.0,2.0,5.0,3.0] |[2.0]         |14.0     |
|[57.0,15.0,1.0,2.0,3.0,3.0]|[2.0]         |14.0     |
|[22.0,1.5,3.0,2.0,3.0,4.0] |[2.0]         |14.0     |
|[32.0,15.0,3.0,3.0,0.0,1.0]|[3.0]         |16.0     |
|[32.0,1.5,1.0,4.0,4.0,4.0] |[4.0]         |17.0     |
|[22.0,0.75,1.0,4.0,5.0,2.0]|[4.0]         |17.0     |
|[37.0,10.0,2.0,5.0,6.0,3.0]|[5.0]         |18.0     |
|[57.0,15.0,4.0,5.0,5.0,4.0]|[5.0]         |18.0     |
+---------------------------+--------------+---------+

// This shows that once a categorical feature has been indexed, the index codes follow the
// ascending order of the original values, starting from 0.
// Index codes (0, 1, 2, 3, 4, 5, 6) correspond to the original values [9.0, 12.0, 14.0, 16.0, 17.0, 18.0, 20.0].
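The usual reason to run VectorIndexer is that tree-based models in spark.ml read the categorical metadata it attaches, so they can split on categories instead of numeric thresholds. A minimal sketch of wiring the assembler, the indexer and a decision tree into one Pipeline; the use of "affairs" as a binary 0.0/1.0 label and the 70/30 split are assumptions, not part of the original post:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}

val assembler = new VectorAssembler()
  .setInputCols(Array("age", "yearsmarried", "religiousness", "education", "occupation", "rating"))
  .setOutputCol("features")

val indexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(7)

// Assumption: "affairs" has been recoded to 0.0 / 1.0 so it can serve as a binary label.
val dt = new DecisionTreeClassifier()
  .setLabelCol("affairs")
  .setFeaturesCol("indexedFeatures")

// The Pipeline fits the indexer and the tree together; the tree then treats the four
// low-cardinality entries of the vector as categorical features.
val pipeline = new Pipeline().setStages(Array(assembler, indexer, dt))

val Array(train, test) = data.randomSplit(Array(0.7, 0.3), seed = 42)  // assumed split
val model = pipeline.fit(train)
model.transform(test).select("affairs", "prediction").show(5)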
VectorSlicer: slice a feature vector
import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer}

val colArray = Array("age", "yearsmarried", "religiousness", "education", "occupation", "rating")

// Assemble the columns into a single feature vector
val assembler = new VectorAssembler().setInputCols(colArray).setOutputCol("features")
val vecDF = assembler.transform(data)

val slicer = new VectorSlicer().setInputCol("features").setOutputCol("slicerFeatures")

// Specify the positions to keep within the vector column "features":
// indices (2, 3, 4) correspond to the columns ("religiousness", "education", "occupation")
slicer.setIndices(Array(2, 3, 4))

val output = slicer.transform(vecDF)
output.select("features", "slicerFeatures", "religiousness", "education", "occupation").show(10, truncate = false)
+----------------------------+--------------+-------------+---------+----------+
|features                    |slicerFeatures|religiousness|education|occupation|
+----------------------------+--------------+-------------+---------+----------+
|[37.0,10.0,3.0,18.0,7.0,4.0]|[3.0,18.0,7.0]|3.0          |18.0     |7.0       |
|[27.0,4.0,4.0,14.0,6.0,4.0] |[4.0,14.0,6.0]|4.0          |14.0     |6.0       |
|[32.0,15.0,1.0,12.0,1.0,4.0]|[1.0,12.0,1.0]|1.0          |12.0     |1.0       |
|[57.0,15.0,5.0,18.0,6.0,5.0]|[5.0,18.0,6.0]|5.0          |18.0     |6.0       |
|[22.0,0.75,2.0,17.0,6.0,3.0]|[2.0,17.0,6.0]|2.0          |17.0     |6.0       |
|[32.0,1.5,2.0,17.0,5.0,5.0] |[2.0,17.0,5.0]|2.0          |17.0     |5.0       |
|[22.0,0.75,2.0,12.0,1.0,3.0]|[2.0,12.0,1.0]|2.0          |12.0     |1.0       |
|[57.0,15.0,2.0,14.0,4.0,4.0]|[2.0,14.0,4.0]|2.0          |14.0     |4.0       |
|[32.0,15.0,4.0,16.0,1.0,2.0]|[4.0,16.0,1.0]|4.0          |16.0     |1.0       |
|[22.0,1.5,4.0,14.0,4.0,5.0] |[4.0,14.0,4.0]|4.0          |14.0     |4.0       |
+----------------------------+--------------+-------------+---------+----------+
only showing top 10 rows

output.printSchema()
root
 |-- affairs: double (nullable = false)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = false)
 |-- yearsmarried: double (nullable = false)
 |-- children: string (nullable = true)
 |-- religiousness: double (nullable = false)
 |-- education: double (nullable = false)
 |-- occupation: double (nullable = false)
 |-- rating: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- slicerFeatures: vector (nullable = true)
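Besides integer indices, VectorSlicer can also pick entries by attribute name, because VectorAssembler records each source column's name in the vector's ML attribute metadata. A short sketch reusing the vecDF above; the output column name "slicedByName" is an arbitrary choice:

import org.apache.spark.ml.feature.VectorSlicer

// Select the same kind of slice by name instead of by position.
// This works only when the input vector carries attribute names,
// which is the case for vectors produced by VectorAssembler.
val namedSlicer = new VectorSlicer()
  .setInputCol("features")
  .setOutputCol("slicedByName")        // assumed output column name
  .setNames(Array("religiousness", "education", "occupation"))

namedSlicer.transform(vecDF)
  .select("slicedByName", "religiousness", "education", "occupation")
  .show(5, truncate = false)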
Bucketizer: discretize continuous values into specified ranges
import org.apache.spark.ml.feature.Bucketizer

// Double.NegativeInfinity: negative infinity; Double.PositiveInfinity: positive infinity
// Six buckets: [-Inf, -100), [-100, -10), [-10, 0), [0, 10), [10, 90), [90, +Inf)
val splits = Array(Double.NegativeInfinity, -100, -10, 0.0, 10, 90, Double.PositiveInfinity)

val data: Array[Double] = Array(-180, -160, -100, -50, -70, -20, -8, -5, -3, 0.0, 1, 3, 7, 10, 30, 60, 90, 100, 120, 150)
val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
dataFrame: org.apache.spark.sql.DataFrame = [features: double]

val bucketizer = new Bucketizer()
  .setInputCol("features")
  .setOutputCol("bucketedFeatures")
  .setSplits(splits)

// Transform the raw values into bucket indices
val bucketedData = bucketizer.transform(dataFrame)
bucketedData: org.apache.spark.sql.DataFrame = [features: double, bucketedFeatures: double]

bucketedData.show(50, truncate = false)
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|-180.0  |0.0             |
|-160.0  |0.0             |
|-100.0  |1.0             |
|-50.0   |1.0             |
|-70.0   |1.0             |
|-20.0   |1.0             |
|-8.0    |2.0             |
|-5.0    |2.0             |
|-3.0    |2.0             |
|0.0     |3.0             |
|1.0     |3.0             |
|3.0     |3.0             |
|7.0     |3.0             |
|10.0    |4.0             |
|30.0    |4.0             |
|60.0    |4.0             |
|90.0    |5.0             |
|100.0   |5.0             |
|120.0   |5.0             |
|150.0   |5.0             |
+--------+----------------+
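If you would rather let Spark choose the split points from the data instead of supplying them by hand, QuantileDiscretizer does the same discretization as an Estimator that learns approximately equal-frequency bucket boundaries. A small sketch reusing the dataFrame above; the bucket count of 4 is an arbitrary choice:

import org.apache.spark.ml.feature.QuantileDiscretizer

// QuantileDiscretizer estimates quantile-based split points from the data,
// then returns a fitted Bucketizer that applies them.
val discretizer = new QuantileDiscretizer()
  .setInputCol("features")
  .setOutputCol("quantileBuckets")
  .setNumBuckets(4)                    // assumed bucket count

val fittedBucketizer = discretizer.fit(dataFrame)
println(fittedBucketizer.getSplits.mkString("[", ", ", "]"))  // the learned split points

fittedBucketizer.transform(dataFrame).show(20, truncate = false)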