spark structured streaming 寫入 hudi 表

hgs19921112發表於2022-03-06
  1. 透過spark-sql建立hudi表

-- Create a Hudi Merge-on-Read table keyed by `id`.
-- When two records share the same primary key, the record with the
-- larger precombine-field value (`price`) is kept.
create table if not exists hudi_table3(
  id int,
  name string,
  price double
)  using hudi
options (
  'type' = 'mor',
  'primaryKey' = 'id',
  'hoodie.datasource.hive_sync.enable'='false',
  'hoodie.datasource.meta.sync.enable'='false',
  -- FIX: OPTIONS values must be string literals; the original bare `price`
  -- (unquoted) is a Spark SQL parse error.
  'hoodie.datasource.write.precombine.field'='price'
)

  2. 寫入hudi程式碼

// Build a local SparkSession with the Hudi SQL extension and Kryo
// serialization (Hudi's write path expects KryoSerializer).
val spark =
  SparkSession.builder()
    .master("local[*]")
    .enableHiveSupport()
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()

// Stream CSV-style records ("id,name,price") from the Kafka topic `test`.
// FIX: the original `.option("group.id", "test-1")` was dropped — the Spark
// Kafka source only honours Kafka client options prefixed with `kafka.`,
// and Spark manages consumer group ids itself, so the unprefixed option
// had no effect.
val df = spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "test")
  .load()

import spark.implicits._

val query = df
  // Kafka `value` is binary; cast to string and split the comma-separated
  // payload into typed columns.
  .selectExpr("split(cast(value as string),',') as sp")
  .selectExpr("cast(sp[0] as int) as id", "sp[1] as name", "cast(sp[2] as double) as price")
  .writeStream.format("hudi")
  .trigger(Trigger.ProcessingTime(5000L))
  .option("checkpointLocation", "file:///Users/haoguangshi/mysoft/ck")
  .option("path", "/Users/haoguangshi/workspace/hudi-lrn/spark-warehouse/hudi_table3")
  // When two records share the same key, keep the one with the larger
  // value of this field (PRECOMBINE_FIELD_OPT_KEY).
  .option("hoodie.datasource.write.precombine.field", "price")
  // Record key / table primary key (RECORDKEY_FIELD_OPT_KEY).
  .option("hoodie.datasource.write.recordkey.field", "id")
  // FIX: Hudi's streaming writer requires the table name; without it the
  // stream fails at start with a missing `hoodie.table.name` error.
  .option("hoodie.table.name", "hudi_table3")
  .start()

query.awaitTermination()
spark.stop()



來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/31506529/viewspace-2865291/,如需轉載,請註明出處,否則將追究法律責任。

相關文章