【雲星資料---Apache Flink in Practice Series (Premium Edition)】: Apache Flink Advanced Features and Advanced Applications 014 - Common sinks and sources in Flink batch processing 002

Published by the 雲星資料 big data team on 2017-11-19

II. Common sinks in Flink batch processing

1. Collection-based sink
2. File-based sink
3. File-based sink (sorting the data)

1. Collection-based sink

package code.book.batch.sinksource.scala
import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _}
object DataSink000 {
  def main(args: Array[String]): Unit = {
    //1.set up the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    //2.define the data: stu(age, name, height)
    val stu: DataSet[(Int, String, Double)] = env.fromElements(
      (19, "zhangsan", 178.8),
      (17, "lisi", 168.8),
      (18, "wangwu", 184.8),
      (21, "zhaoliu", 164.8)
    )
    //3.sink to standard output
    stu.print

    //4.sink to standard error output
    stu.printToErr()

    //5.sink to a local collection
    print(stu.collect())
  }
}
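
As a usage note, collect() executes the program and brings the result back to the client as a local Scala Seq, so it is only suitable for small data sets. A minimal sketch (the variable name is illustrative) of working with the collected result:

val localStu: Seq[(Int, String, Double)] = stu.collect()
localStu.foreach { case (age, name, height) =>
  println(s"$name is $age years old and $height cm tall")
}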

2. File-based sink

Flink can write files on a variety of storage systems, including the local file system, HDFS, and Alluxio.
Flink supports several file formats, including text files and CSV files.
package code.book.batch.sinksource.scala

import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _}
import org.apache.flink.core.fs.FileSystem.WriteMode
object DataSink001 {
  def main(args: Array[String]): Unit = {
    //0.Note: whether the target is local or HDFS, if the sink parallelism > 1 the path is treated as a directory name; if the parallelism = 1 it is treated as a file name (a short sketch after this example illustrates the difference).
    val env = ExecutionEnvironment.getExecutionEnvironment
    val ds1: DataSet[(Int, String)] = env.fromCollection(Map(1 -> "spark", 2 -> "flink"))
    //1.write to the local file system as a text file; with NO_OVERWRITE the write fails if the file already exists, with OVERWRITE an existing file is overwritten
    ds1.writeAsText("file:///output/flink/datasink/test001.txt",
      WriteMode.OVERWRITE).setParallelism(1)
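    //for comparison, a hedged sketch (hypothetical path): with NO_OVERWRITE the write
    //fails if the target file already exists instead of replacing it
    //ds1.writeAsText("file:///output/flink/datasink/test001_keep.txt", WriteMode.NO_OVERWRITE)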
    env.execute()

    //2.write to HDFS as a text file; if the path does not exist, it is created automatically
    ds1.writeAsText("hdfs:///output/flink/datasink/test001.txt",
      WriteMode.OVERWRITE).setParallelism(1)
    env.execute()

    //3.write to HDFS as a CSV file
    //3.1 read the CSV file
    val inPath = "hdfs:///input/flink/sales.csv"
    case class Sales(transactionId: String, customerId: Int, itemId: Int, amountPaid: Double)
    val ds2 = env.readCsvFile[Sales](
      filePath = inPath,
      lineDelimiter = "\n",
      fieldDelimiter = ",",
      lenient = false,
      ignoreFirstLine = true,
      includedFields = Array(0, 1, 2, 3),
      pojoFields = Array("transactionId", "customerId", "itemId", "amountPaid")
    )
    //3.2 write the CSV data to HDFS
    val outPath = "hdfs:///output/flink/datasink/sales.csv"
    ds2.writeAsCsv(filePath = outPath, rowDelimiter = "\n",
      fieldDelimiter = "|", writeMode = WriteMode.OVERWRITE).setParallelism(1)
    env.execute()
  }
}
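
To make the note at the top of main concrete, here is a minimal sketch (reusing ds1 and env from the example above; the paths are hypothetical) showing how the sink parallelism decides whether the path becomes a single file or a directory:

//with sink parallelism > 1 (here 2), "out_dir" becomes a directory holding
//one output file per sink subtask
ds1.writeAsText("file:///output/flink/datasink/out_dir", WriteMode.OVERWRITE).setParallelism(2)
//with sink parallelism 1, "out_file.txt" is written as a single file
ds1.writeAsText("file:///output/flink/datasink/out_file.txt", WriteMode.OVERWRITE).setParallelism(1)
env.execute()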

3. File-based sink (sorting the data)

You can use sortPartition to sort the data before sinking it to an external system.

Example program

package code.book.batch.sinksource.scala
import org.apache.flink.api.common.operators.Order
import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _}
import org.apache.flink.core.fs.FileSystem.WriteMode
object DataSink002 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    //stu(age,name,height)
    val stu: DataSet[(Int, String, Double)] = env.fromElements(
      (19, "zhangsan", 178.8),
      (17, "lisi", 168.8),
      (18, "wangwu", 184.8),
      (21, "zhaoliu", 164.8)
    )
    //1.sort by age in ascending order (0->9)
    stu.sortPartition(0, Order.ASCENDING).print
    //2.sort by name in descending order (z->a)
    stu.sortPartition(1, Order.DESCENDING).print
    //3.sort by age ascending, then by height descending
    stu.sortPartition(0, Order.ASCENDING).sortPartition(2, Order.DESCENDING).print
    //4.sort all fields in ascending order
    stu.sortPartition("_", Order.ASCENDING).print
    //5.sort by Student.age in ascending order
    //5.1 prepare the data
    case class Student(name: String, age: Int)
    val ds1: DataSet[(Student, Double)] = env.fromElements(
      (Student("zhangsan", 18), 178.5),
      (Student("lisi", 19), 176.5),
      (Student("wangwu", 17), 168.5)
    )
    val ds2 = ds1.sortPartition("_1.age", Order.ASCENDING).setParallelism(1)
    //5.2 write to HDFS as a text file
    val outPath1="hdfs:///output/flink/datasink/Student001.txt"
    ds2.writeAsText(filePath = outPath1, writeMode = WriteMode.OVERWRITE).setParallelism(1)
    env.execute()
    //5.3 write to HDFS as a CSV file
    val outPath2="hdfs:///output/flink/datasink/Student002.csv"
    ds2.writeAsCsv(filePath = outPath2, rowDelimiter = "\n",
      fieldDelimiter = "|||", writeMode = WriteMode.OVERWRITE).setParallelism(1)
    env.execute()
  }
}
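
Besides field positions and field-expression strings, the Scala DataSet API in Flink 1.x also offers a key-selector overload of sortPartition (treat that overload as an assumption for your Flink version). A minimal sketch that sorts the same (Student, Double) data by the student's name:

//sort each partition by the student's name via a key-selector function
//(assumed overload: sortPartition(fun: T => K, order: Order))
val byName = ds1.sortPartition(_._1.name, Order.ASCENDING).setParallelism(1)
byName.print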
