Requirement
A Spark Scala program that reads data from Hive, processes it with Spark SQL, and then writes it to Redis in batches.
Principles:
1. Interact with Redis as little as possible, to reduce write pressure on Redis.
2. Consider using pipelined operations.
3. Do not submit just one pipeline per partition (see the sketch after this list), because that causes:
   - heavy network transfer pressure
   - high memory consumption
   - heavy processing load on the Redis server
   - possibly higher command-processing latency
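To make principle 3 concrete, here is a minimal, illustrative sketch (the "user:<id>" key pattern, the dummy value, and the caller-supplied Jedis connection are placeholders, not part of the original program) contrasting a single pipeline per partition with a pipeline flushed in smaller batches:

import redis.clients.jedis.Jedis

object PipelineBatchingSketch {
  // Anti-pattern: one pipeline for the whole partition.
  // Every command is buffered on the client and the Redis server
  // until the single sync(), so memory use and latency grow with partition size.
  def wholePartition(ids: Iterator[String], jedis: Jedis): Unit = {
    val pipeline = jedis.pipelined()
    ids.foreach(id => pipeline.set(s"user:$id", "1"))
    pipeline.sync() // one huge round trip
  }

  // Preferred: flush the pipeline every batchSize records,
  // bounding the amount of data in flight per round trip.
  def batched(ids: Iterator[String], jedis: Jedis, batchSize: Int = 1000): Unit = {
    ids.grouped(batchSize).foreach { batch =>
      val pipeline = jedis.pipelined()
      batch.foreach(id => pipeline.set(s"user:$id", "1"))
      pipeline.sync() // one round trip per batch
    }
  }
}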
Implementation
import org.apache.spark.sql.{DataFrame, SparkSession}
import redis.clients.jedis.{Jedis, Pipeline}
import scala.collection.JavaConverters._

object OptimizedHiveToRedisPipeline {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("OptimizedHiveToRedisPipeline")
      .enableHiveSupport()
      .getOrCreate()

    val redisHost = "your_redis_host"
    val redisPort = 6379
    val redisPassword = "your_redis_password" // optional

    // Read the source data from Hive with Spark SQL
    val hiveTableDF = spark.sql(
      """
      SELECT
        user_id,
        name,
        age,
        total_spend,
        date_partition
      FROM your_hive_database.your_hive_table
      WHERE date_partition = '2024-01-01'
      """
    )

    // Keep only the columns that will be written to Redis
    val processedDF = hiveTableDF.select(
      "user_id",
      "name",
      "age",
      "total_spend"
    )

    // Batch size constant: each pipeline flush carries 1000 records
    val BATCH_SIZE = 1000

    def writeToRedisBatch(records: Iterator[org.apache.spark.sql.Row]): Unit = {
      val jedis = new Jedis(redisHost, redisPort)
      try {
        // Authenticate only if a password is configured (optional)
        if (redisPassword.nonEmpty) jedis.auth(redisPassword)
        // Split the partition into batches
        records.grouped(BATCH_SIZE).foreach { batch =>
          val pipeline = jedis.pipelined()
          batch.foreach { record =>
            val userId = record.getString(0)
            val key = s"user:${userId}"
            pipeline.hset(key, "name", record.getString(1))
            pipeline.hset(key, "age", record.getInt(2).toString)
            pipeline.hset(key, "total_spend", record.getDouble(3).toString)
          }
          // sync() each small batch separately
          pipeline.sync()
        }
      } catch {
        case e: Exception =>
          println(s"Redis write error: ${e.getMessage}")
      } finally {
        jedis.close()
      }
    }

    // Write to Redis in batches, controlling the size of each batch
    processedDF.rdd.foreachPartition(writeToRedisBatch)

    spark.stop()
  }
}
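Principle 1 can be pushed a bit further: the loop above issues three HSET commands per record, so each batch of 1000 records puts 3000 commands into the pipeline. Writing all fields of a record with a single hash command cuts the command count to one per record. The following is only a sketch, assuming a Jedis release whose Pipeline accepts a Java Map for hmset; the helper name and field names simply mirror the example above:

import org.apache.spark.sql.Row
import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

object HashPerRecordSketch {
  // Same batching structure as writeToRedisBatch, but each record becomes
  // a single HMSET carrying all fields instead of three separate HSET commands.
  def writeBatches(records: Iterator[Row], jedis: Jedis, batchSize: Int = 1000): Unit = {
    records.grouped(batchSize).foreach { batch =>
      val pipeline = jedis.pipelined()
      batch.foreach { record =>
        val key = s"user:${record.getString(0)}"
        val fields = Map(
          "name"        -> record.getString(1),
          "age"         -> record.getInt(2).toString,
          "total_spend" -> record.getDouble(3).toString
        )
        pipeline.hmset(key, fields.asJava) // one command per record
      }
      pipeline.sync() // still flush each batch separately
    }
  }
}

BATCH_SIZE itself is a tuning knob: larger batches mean fewer round trips but more memory buffered on both client and server per sync(), so values in the hundreds to a few thousand records are a reasonable starting point.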