Spark: Controlling Batch Writes to Redis

Published by 唐钰逍遥 on 2024-12-05

Requirements

A Spark Scala program reads data from Hive, parses it with Spark SQL, and then writes the results to Redis in batches.

Principles:

1. Interact with Redis as little as possible to reduce write pressure on Redis (one way to cut the number of commands per record is sketched after this list).

2. Consider using Redis pipeline operations.

3. Do not submit only one pipeline per partition, because:

  1. Network transfer pressure is high

  2. Memory consumption is high

  3. The Redis server bears a heavy processing load

  4. Command-processing latency may increase
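
To illustrate principle 1, the three hset calls per record used in the implementation below can be collapsed into a single hset that takes a field map, so each record costs one Redis command instead of three. This is a minimal sketch, assuming a Jedis 3.x+ client whose Pipeline exposes hset(key, Map); the host, key, and field values are placeholders, not values from the original post.

import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

// Sketch: send all fields of one record as a single HSET command inside a pipeline
val jedis = new Jedis("your_redis_host", 6379)
val pipeline = jedis.pipelined()

// Placeholder record values for illustration
val fields = Map(
  "name"        -> "Alice",
  "age"         -> "30",
  "total_spend" -> "123.45"
)
pipeline.hset("user:1001", fields.asJava) // one command carries all three fields

pipeline.sync()
jedis.close()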

Implementation

import org.apache.spark.sql.{DataFrame, SparkSession}
import redis.clients.jedis.{Jedis, Pipeline}

object OptimizedHiveToRedisPipeline {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("OptimizedHiveToRedisPipeline")
      .enableHiveSupport()
      .getOrCreate()

    val redisHost = "your_redis_host"
    val redisPort = 6379
    val redisPassword = "your_redis_password" // optional; leave empty if auth is not enabled

    val hiveTableDF = spark.sql(
      """
      SELECT 
        user_id, 
        name, 
        age, 
        total_spend,
        date_partition
      FROM your_hive_database.your_hive_table
      WHERE date_partition = '2024-01-01'
      """
    )

    val processedDF = hiveTableDF.select(
      "user_id", 
      "name", 
      "age", 
      "total_spend"
    )

    // Define the batch size constant
    val BATCH_SIZE = 1000 // process 1,000 records per batch

    def writeToRedisBatch(records: Iterator[org.apache.spark.sql.Row]): Unit = {
      val jedis = new Jedis(redisHost, redisPort)
      // Authenticate only if a password is configured
      if (redisPassword != null && redisPassword.nonEmpty) jedis.auth(redisPassword)
      
      try {
        // Process the partition's records in fixed-size batches
        records.grouped(BATCH_SIZE).foreach { batch =>
          val pipeline = jedis.pipelined()
          
          batch.foreach { record =>
            val userId = record.getString(0)
            val key = s"user:${userId}"
            
            pipeline.hset(key, "name", record.getString(1))
            pipeline.hset(key, "age", record.getInt(2).toString)
            pipeline.hset(key, "total_spend", record.getDouble(3).toString)
          }
          
          // Execute sync for each small batch
          pipeline.sync()
        }
      } catch {
        case e: Exception => 
          println(s"Redis write error: ${e.getMessage}")
      } finally {
        jedis.close()
      }
    }

    // Write to Redis in batches, controlling the size of each batch
    processedDF.rdd.foreachPartition(writeToRedisBatch)

    spark.stop()
  }
}
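
One knob worth noting: foreachPartition opens one Jedis connection (and its own pipelines) per partition, so the number of Spark partitions bounds how many concurrent writers hit Redis. A hedged sketch of capping that by repartitioning before the write; the value 20 is an arbitrary example, not a figure from the original post:

// Cap concurrent Redis connections by controlling the partition count first.
// 20 is illustrative; tune it against the Redis server's capacity.
processedDF
  .repartition(20)
  .rdd
  .foreachPartition(writeToRedisBatch)

A larger BATCH_SIZE means fewer pipeline.sync() round trips but more commands buffered per flush on both the executor and the Redis side, so it should be tuned together with the partition count.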
