Spark MLlib Machine Learning (Java version)

By 蝸牛.~, published 2020-11-26

The following introduces commonly used Spark MLlib algorithms and how to use them, with demos written against the Java API.

pom dependencies

The versions used here are Spark 2.0.0 and Scala 2.11; adjust them to fit your own setup.


        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>2.0.0</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.12</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.0.0</version>
        </dependency>

Support Vector Machine (SVM) algorithm

import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.mllib.linalg.DenseVector;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.classification.SVMModel;
import org.apache.spark.mllib.classification.SVMWithSGD;
import org.apache.spark.mllib.regression.LabeledPoint;

import java.util.ArrayList;
import java.util.List;


/**
 * Spark Support Vector Machine (SVM)
 * SVM is a binary classifier: each data point's label must be 0 or 1.
 * After clearing the threshold, predict() returns a raw numeric score; whichever class the
 * score lies closer to is the predicted result. If the threshold is not cleared, the output
 * is forced to 0 or 1 and the measured error below becomes much larger.
 */
public class SVM {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("test").master("local[2]").getOrCreate();
        //build synthetic sample data
        List<LabeledPoint> labeledPoints = new ArrayList<>();
        for (int i = 0; i < 100000; i++) {//SVM is a binary classifier: labels must be 0 or 1
            Vector vector1 = new DenseVector(new double[]{getRandom(0D, 0.1), getRandom(0D, 0.1)});
            LabeledPoint labeledPoint1 = new LabeledPoint(0, vector1);//label 0
            Vector vector2 = new DenseVector(new double[]{getRandom(0.9D, 1.0), getRandom(0.9D, 1.0)});
            LabeledPoint labeledPoint2 = new LabeledPoint(1, vector2);//label 1
            labeledPoints.add(labeledPoint1);
            labeledPoints.add(labeledPoint2);
        }
        SparkContext sc = spark.sparkContext();
        sc.setLogLevel("ERROR");//suppress log output
        JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
        JavaRDD<LabeledPoint> data = jsc.parallelize(labeledPoints);

        //split the initial RDD into two parts: 60% training data, 40% test data
        JavaRDD<LabeledPoint>[] javaRDDS = data.randomSplit(new double[]{0.6, 0.4}, 11L);
        JavaRDD<LabeledPoint> train = javaRDDS[0];
        JavaRDD<LabeledPoint> test = javaRDDS[1];

        int numIterations = 10000;//number of iterations
        SVMModel model = SVMWithSGD.train(train.rdd(), numIterations);//in a cluster the model should be broadcast to every executor; ignored here since we run locally
        model.clearThreshold();//clear the default threshold; without this, predict() returns only 0 or 1 and the measured error is much larger
        JavaRDD<Tuple2<Double, Double>> scoreAndLabels = test.map(p -> new Tuple2<>(model.predict(p.features()), p.label())); //compute raw scores on the test set
        long count = scoreAndLabels.count();
        Double reduce = scoreAndLabels.map(f -> Math.abs(f._1 - f._2)).reduce((Function2<Double, Double, Double>) (v1, v2) -> v1 + v2);

        System.out.println("誤差" + reduce/count);
        for (LabeledPoint labeledPoint : test.take(10)) {
            System.out.println(labeledPoint.features() + ":" + labeledPoint.label() + "=>" + model.predict(labeledPoint.features()));
        }
    }

    /**
     * Return a random number within the given range.
     *
     * @param min lower bound of the random number
     * @param max upper bound of the random number
     * @return a random double in [min, max)
     */
    public static Double getRandom(Double min, Double max) {
        return Math.random() * (max - min) + min;
    }
}
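Since the raw scores and labels are already paired up above, the same kind of RDD can also be fed into MLlib's BinaryClassificationMetrics to get an AUC value, which is a more standard way to judge a binary classifier than the averaged absolute error printed above. The sketch below is illustrative only: the class and method names SVMEvaluation/areaUnderROC are made up here, and model/test refer to the variables from the demo above.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.classification.SVMModel;
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
import org.apache.spark.mllib.regression.LabeledPoint;
import scala.Tuple2;

public class SVMEvaluation {
    /**
     * Compute the area under the ROC curve for a trained SVM model.
     * Assumes model.clearThreshold() has been called, so predict() returns raw scores.
     */
    public static double areaUnderROC(SVMModel model, JavaRDD<LabeledPoint> test) {
        //pair each raw score with the true label
        JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(p ->
                new Tuple2<>(model.predict(p.features()), p.label()));
        BinaryClassificationMetrics metrics =
                new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels));
        return metrics.areaUnderROC();
    }
}

If hard 0/1 predictions are needed again after evaluation, model.setThreshold(0.0) restores the default decision boundary.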

Linear Regression


import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.DenseVector;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.regression.LinearRegressionModel;
import org.apache.spark.mllib.regression.LinearRegressionWithSGD;
import org.apache.spark.sql.SparkSession;

import java.util.ArrayList;
import java.util.List;


/**
 * Spark linear regression
 * Linear regression is a regression algorithm, not a classifier:
 * the label is a continuous numeric value rather than a class index.
 */
public class LinearRegression {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("test").master("local[2]").getOrCreate();
        //build synthetic sample data
        List<LabeledPoint> labeledPoints = new ArrayList<>();
        for (int i = -5; i < 100; i++) {//generate points roughly along the line y = 12x with some random noise
            Double x = i * 1.0;
            Double y = x * 12 - 10 * (Math.random()-0.5);
            Vector vector1 = new DenseVector(new double[]{x});
            LabeledPoint labeledPoint1 = new LabeledPoint(y, vector1);//the label is the noisy y value
            labeledPoints.add(labeledPoint1);
        }
        SparkContext sc = spark.sparkContext();
        sc.setLogLevel("ERROR");//suppress log output
        JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
        JavaRDD<LabeledPoint> data = jsc.parallelize(labeledPoints);

        //split the initial RDD into two parts: 60% training data, 40% test data
        JavaRDD<LabeledPoint>[] javaRDDS = data.randomSplit(new double[]{0.6, 0.4}, 11L);
        JavaRDD<LabeledPoint> train = javaRDDS[0];
        JavaRDD<LabeledPoint> test = javaRDDS[1];

        int numIterations = 20000;//number of iterations
        double stepSize = 0.003;//learning rate (step size)
        LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(train), numIterations, stepSize);

        for (LabeledPoint labeledPoint : test.take(20)) {
            System.out.println(labeledPoint.features() + ":" + labeledPoint.label() + "=>" + model.predict(labeledPoint.features()));
        }
    }
}
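For the regression demo, a common way to quantify the fit is the mean squared error on the test set, together with a look at the learned coefficients: for data generated from roughly y = 12x the single weight should come out close to 12 (the intercept typically stays at 0 because the static LinearRegressionWithSGD.train helpers do not fit an intercept). Below is a minimal sketch under those assumptions; the class and method names LinearRegressionEvaluation/evaluate are made up for illustration, and model/test refer to the variables from the demo above.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.regression.LinearRegressionModel;
import scala.Tuple2;

public class LinearRegressionEvaluation {
    /**
     * Print the learned coefficients and the mean squared error on a test set.
     */
    public static void evaluate(LinearRegressionModel model, JavaRDD<LabeledPoint> test) {
        //learned parameters; for data generated from y = 12x the weight should be close to 12
        System.out.println("weights: " + model.weights() + ", intercept: " + model.intercept());

        //pair each prediction with the true label, then average the squared differences
        JavaRDD<Tuple2<Double, Double>> predictionAndLabel = test.map(p ->
                new Tuple2<>(model.predict(p.features()), p.label()));
        double mse = predictionAndLabel
                .mapToDouble(pair -> Math.pow(pair._1() - pair._2(), 2))
                .mean();
        System.out.println("test MSE: " + mse);
    }
}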

 
