II. Common sinks in Flink batch processing
1. Collection-based sink
2. File-based sink
1. Collection-based sink
package code.book.batch.sinksource.scala

import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _}

object DataSink000 {
  def main(args: Array[String]): Unit = {
    // 1. Set up the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    // 2. Define the data: stu(age, name, height)
    val stu: DataSet[(Int, String, Double)] = env.fromElements(
      (19, "zhangsan", 178.8),
      (17, "lisi", 168.8),
      (18, "wangwu", 184.8),
      (21, "zhaoliu", 164.8)
    )
    // 3. Sink to standard output
    stu.print
    // 4. Sink to standard error
    stu.printToErr()
    // 5. Sink to a local collection on the client
    print(stu.collect())
  }
}
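print and printToErr stream the results to the client's stdout/stderr, while collect() materializes the whole DataSet as a local Seq on the client, so it is only suitable for small result sets. A minimal follow-on sketch (continuing the example above; the age filter is just for illustration) that treats the collected values as an ordinary Scala collection:

    // collect() returns Seq[(Int, String, Double)] on the client
    val localStu: Seq[(Int, String, Double)] = stu.collect()
    localStu.filter(_._1 >= 18).foreach(println)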
2. File-based sink
Flink can write files to several storage systems, including the local filesystem, HDFS, and Alluxio (see the sketch after the example below).
Flink also supports several file formats, including plain text and CSV.
package code.book.batch.sinksource.scala

import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _}
import org.apache.flink.core.fs.FileSystem.WriteMode

object DataSink001 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    // 1. Write a DataSet as text to the local filesystem.
    //    A Scala Map is an Iterable[(K, V)], so fromCollection yields a DataSet of tuples.
    val ds1: DataSet[(Int, String)] = env.fromCollection(Map(1 -> "spark", 2 -> "flink"))
    // Parallelism 1 on the sink produces a single output file
    ds1.writeAsText("file:///output/flink/datasink/test001.txt", WriteMode.OVERWRITE)
      .setParallelism(1)
    env.execute()
    // 2. Write the same DataSet as text to HDFS
    ds1.writeAsText("hdfs:///output/flink/datasink/test001.txt", WriteMode.OVERWRITE)
      .setParallelism(1)
    env.execute()
    // 3. Read a CSV file from HDFS, then write it back as CSV with a different field delimiter
    val inPath = "hdfs:///input/flink/sales.csv"
    case class Sales(transactionId: String, customerId: Int, itemId: Int, amountPaid: Double)
    val ds2 = env.readCsvFile[Sales](
      filePath = inPath,
      lineDelimiter = "\n",
      fieldDelimiter = ",",
      lenient = false,
      ignoreFirstLine = true,
      includedFields = Array(0, 1, 2, 3),
      pojoFields = Array("transactionId", "customerId", "itemId", "amountPaid")
    )
    val outPath = "hdfs:///output/flink/datasink/sales.csv"
    ds2.writeAsCsv(filePath = outPath, rowDelimiter = "\n",
      fieldDelimiter = "|", writeMode = WriteMode.OVERWRITE).setParallelism(1)
    env.execute()
  }
}
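The program above covers local and HDFS URIs. Writing to Alluxio, as mentioned in the intro, uses the same sink call with an alluxio:// URI; a minimal sketch, assuming the Alluxio client is on the classpath and using a placeholder master address:

    // hypothetical Alluxio master host/port; adjust to your deployment
    ds1.writeAsText("alluxio://alluxio-master:19998/output/flink/datasink/test001.txt",
      WriteMode.OVERWRITE).setParallelism(1)
    env.execute()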
3. File-based sink (sorting the data first)
You can use sortPartition to sort the data before sinking it to an external system. Note that sortPartition orders records within each partition only; for a globally ordered output, bring the data into a single partition (e.g. with setParallelism(1), as in the example below).
Example program:
package code.book.batch.sinksource.scala

import org.apache.flink.api.common.operators.Order
import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _}
import org.apache.flink.core.fs.FileSystem.WriteMode

object DataSink002 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    // 1. Define the data: stu(age, name, height)
    val stu: DataSet[(Int, String, Double)] = env.fromElements(
      (19, "zhangsan", 178.8),
      (17, "lisi", 168.8),
      (18, "wangwu", 184.8),
      (21, "zhaoliu", 164.8)
    )
    // 2. Sort by the first field (age), ascending
    stu.sortPartition(0, Order.ASCENDING).print
    // 3. Sort by the second field (name), descending
    stu.sortPartition(1, Order.DESCENDING).print
    // 4. Sort by age ascending, then by height descending
    stu.sortPartition(0, Order.ASCENDING).sortPartition(2, Order.DESCENDING).print
    // 5. "_" selects the whole tuple: sort on all fields, ascending
    stu.sortPartition("_", Order.ASCENDING).print
    // 6. Sort by a field of a nested case class, then write the result out.
    //    Parallelism 1 yields a single, globally sorted output file.
    case class Student(name: String, age: Int)
    val ds1: DataSet[(Student, Double)] = env.fromElements(
      (Student("zhangsan", 18), 178.5),
      (Student("lisi", 19), 176.5),
      (Student("wangwu", 17), 168.5)
    )
    val ds2 = ds1.sortPartition("_1.age", Order.ASCENDING).setParallelism(1)
    // 6.1 Write as text to HDFS
    val outPath1 = "hdfs:///output/flink/datasink/Student001.txt"
    ds2.writeAsText(filePath = outPath1, writeMode = WriteMode.OVERWRITE)
    env.execute()
    // 6.2 Write as CSV to HDFS
    val outPath2 = "hdfs:///output/flink/datasink/Student002.csv"
    ds2.writeAsCsv(filePath = outPath2, rowDelimiter = "\n",
      fieldDelimiter = "|||", writeMode = WriteMode.OVERWRITE)
    env.execute()
  }
}
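Besides field positions and field-name expressions, Flink 1.x also offers a key-selector overload of sortPartition, which is handy when the sort key is computed rather than a plain field; a minimal sketch, continuing the stu example above (assuming your Flink version includes this overload in the Scala DataSet API):

    // sort each partition by height (the third tuple field) via a key-selector function
    stu.sortPartition(s => s._3, Order.DESCENDING).print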