I. Common sources in Flink batch processing
In batch processing, Flink's commonly used sources fall into two main categories:
1. Collection-based sources
2. File-based sources
1. Collection-based source
There are three common ways to create a DataSet in Flink:
1. env.fromElements(), which also supports composite types such as Tuples and custom objects (a sketch with a custom case class follows the program below).
2. env.fromCollection(), which supports many concrete Collection types.
3. env.generateSequence(), which creates a sequence-based DataSet.
Example program:
package code.book.batch.sinksource.scala

import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _}

import scala.collection.immutable.{Queue, Stack}
import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, ListBuffer}

object DataSource001 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // 1. fromElements: individual elements, including Tuples
    val ds0: DataSet[String] = env.fromElements("spark", "flink")
    ds0.print()
    val ds1: DataSet[(Int, String)] = env.fromElements((1, "spark"), (2, "flink"))
    ds1.print()

    // 2. fromCollection: Array / ArrayBuffer
    val ds2: DataSet[String] = env.fromCollection(Array("spark", "flink"))
    ds2.print()
    val ds3: DataSet[String] = env.fromCollection(ArrayBuffer("spark", "flink"))
    ds3.print()

    // fromCollection: List / ListBuffer
    val ds4: DataSet[String] = env.fromCollection(List("spark", "flink"))
    ds4.print()
    val ds5: DataSet[String] = env.fromCollection(ListBuffer("spark", "flink"))
    ds5.print()

    // fromCollection: Vector / Queue / Stack / Stream
    val ds6: DataSet[String] = env.fromCollection(Vector("spark", "flink"))
    ds6.print()
    val ds7: DataSet[String] = env.fromCollection(Queue("spark", "flink"))
    ds7.print()
    val ds8: DataSet[String] = env.fromCollection(Stack("spark", "flink"))
    ds8.print()
    val ds9: DataSet[String] = env.fromCollection(Stream("spark", "flink"))
    ds9.print()

    // fromCollection: Seq / Set / Iterable
    val ds10: DataSet[String] = env.fromCollection(Seq("spark", "flink"))
    ds10.print()
    val ds11: DataSet[String] = env.fromCollection(Set("spark", "flink"))
    ds11.print()
    val ds12: DataSet[String] = env.fromCollection(Iterable("spark", "flink"))
    ds12.print()

    // fromCollection: mutable ArraySeq / ArrayStack
    val ds13: DataSet[String] = env.fromCollection(mutable.ArraySeq("spark", "flink"))
    ds13.print()
    val ds14: DataSet[String] = env.fromCollection(mutable.ArrayStack("spark", "flink"))
    ds14.print()

    // fromCollection: Map (yields (key, value) tuples) and Range
    val ds15: DataSet[(Int, String)] = env.fromCollection(Map(1 -> "spark", 2 -> "flink"))
    ds15.print()
    val ds16: DataSet[Int] = env.fromCollection(Range(1, 9))
    ds16.print()

    // 3. generateSequence: a DataSet of Longs from 1 to 9 (inclusive)
    val ds17: DataSet[Long] = env.generateSequence(1, 9)
    ds17.print()
  }
}
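Point 1 above notes that fromElements() also accepts custom objects, which the program does not show. A minimal sketch of that case, assuming a made-up Word case class (not part of the program above):

package code.book.batch.sinksource.scala

import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _}

object DataSource001b {
  // made-up element type used only for this illustration
  case class Word(word: String, count: Int)

  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    // fromElements with custom (case class) objects instead of Tuples
    val ds: DataSet[Word] = env.fromElements(Word("spark", 1), Word("flink", 2))
    ds.print()
  }
}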
2. File-based source
Flink supports files on a variety of storage systems, including local files, HDFS files, Alluxio files, and so on.
Flink also supports a variety of file formats, including text files and CSV files.
Example program:
package code.book.batch.sinksource.scala

import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _}

object DataSource002 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // read a text file from the local filesystem
    val ds1: DataSet[String] = env.readTextFile("file:///Applications/flink-1.1.3/README.txt")
    ds1.print()

    // read a text file from HDFS
    val ds2: DataSet[String] = env.readTextFile("hdfs:///input/flink/README.txt")
    ds2.print()

    // read a CSV file from HDFS into a Tuple type
    val path = "hdfs://qingcheng11:9000/input/flink/sales.csv"
    val ds3 = env.readCsvFile[(String, Int, Int, Double)](
      filePath = path,
      lineDelimiter = "\n",               // row delimiter
      fieldDelimiter = ",",               // column delimiter
      lenient = false,                    // fail on malformed lines instead of skipping them
      ignoreFirstLine = true,             // skip the header row
      includedFields = Array(0, 1, 2, 3)) // columns to read
    ds3.print()

    // read the same CSV file into a case class
    case class Sales(transactionId: String, customerId: Int, itemId: Int, amountPaid: Double)
    val ds4 = env.readCsvFile[Sales](
      filePath = path,
      lineDelimiter = "\n",
      fieldDelimiter = ",",
      lenient = false,
      ignoreFirstLine = true,
      includedFields = Array(0, 1, 2, 3),
      pojoFields = Array("transactionId", "customerId", "itemId", "amountPaid") // map columns to fields
    )
    ds4.print()
  }
}
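For reference, ignoreFirstLine = true tells Flink to skip a header row, and includedFields = Array(0, 1, 2, 3) selects the first four columns. A hypothetical sales.csv matching the (String, Int, Int, Double) / Sales layout above might look like this (made-up data, for illustration only):

transactionId,customerId,itemId,amountPaid
t001,1,101,199.99
t002,2,102,49.50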
3. File-based source (recursive directory traversal)
Flink can traverse all files under a directory, including the files in all of its subdirectories.
Example program:
package code.book.batch.sinksource.scala

import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.configuration.Configuration

/**
 * Recursively reads all files in an HDFS directory, traversing subdirectories at every level.
 */
object DataSource003 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // enable recursive enumeration of files in nested subdirectories
    val parameters = new Configuration
    parameters.setBoolean("recursive.file.enumeration", true)

    // pass the configuration to the file source
    val ds1 = env.readTextFile("hdfs:///input/flink").withParameters(parameters)
    ds1.print()
  }
}
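The recursive.file.enumeration flag is read by Flink's file input formats, so the same pattern should also work for other file-based sources such as readCsvFile. A minimal sketch under that assumption, using a made-up directory hdfs:///input/flink/sales of CSV files with the same four-column layout as above:

package code.book.batch.sinksource.scala

import org.apache.flink.api.scala.{ExecutionEnvironment, _}
import org.apache.flink.configuration.Configuration

object DataSource004 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // same flag as above: also enumerate files in nested subdirectories
    val parameters = new Configuration
    parameters.setBoolean("recursive.file.enumeration", true)

    // hypothetical directory of CSV files with the (String, Int, Int, Double) layout used earlier
    val ds = env.readCsvFile[(String, Int, Int, Double)]("hdfs:///input/flink/sales")
      .withParameters(parameters)
    ds.print()
  }
}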