Spark with Flume 1.6.0
package hgs.spark.streaming

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.storage.StorageLevel
import org.apache.spark.HashPartitioner

/* Add the following dependency to pom.xml:
 * <dependency>
 *   <groupId>org.apache.spark</groupId>
 *   <artifactId>spark-streaming-flume_2.11</artifactId>
 *   <version>2.1.0</version>
 * </dependency>
 */

/* Flume conf file:
a1.sources=r1
a1.sinks=k1
a1.channels=c1

a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/home/logs
a1.sources.r1.fileHeader=true

a1.sinks.k1.type=avro
a1.sinks.k1.hostname=192.168.1.9
a1.sinks.k1.port=8888

a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100

a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1

# the command to start an agent
# bin/flume-ng agent -n $agent_name -c conf -f conf/flume-conf.properties.template
*/

object SparkStreamingFlumePush {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("flume-push").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint("d:\\checkpoint")

    val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => {
      //iter.flatMap(it=>Some(it._2.sum+it._3.getOrElse(0)).map((it._1,_)))   // variant 1
      //iter.flatMap{case(x,y,z)=>{Some(y.sum+z.getOrElse(0)).map((x,_))}}    // variant 2
      iter.flatMap(it => Some(it._1, (it._2.sum.toInt + it._3.getOrElse(0)))) // variant 3
    }

    // There are two ways to get the data: push and poll. This is push, i.e. Flume pushes
    // the data to Spark; the ip and port here are the Spark receiver's ip address and port.
    val rds = FlumeUtils.createStream(ssc, "192.168.1.9", 8888, StorageLevel.MEMORY_ONLY)

    val result = rds.flatMap(x => (new String(x.event.getBody.array())).split(" "))
      .map(x => (x, 1))
      .updateStateByKey(updateFunc, new HashPartitioner(sc.defaultMinPartitions), true)

    result.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
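For comparison with the three iterator-based variants in updateFunc above, Spark Streaming also accepts a simpler per-key update function for updateStateByKey; a minimal sketch (the name perKeyUpdate is illustrative):

// Simpler per-key form: Spark iterates the partition internally and calls this
// once per key, passing the batch's new counts and the previous running total.
val perKeyUpdate: (Seq[Int], Option[Int]) => Option[Int] =
  (newCounts, previous) => Some(newCounts.sum + previous.getOrElse(0))

// Possible drop-in usage instead of the iterator-based call above
// (this overload does not take a custom partitioner):
// .updateStateByKey[Int](perKeyUpdate)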
package hgs.spark.streaming

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.flume.FlumeUtils
import java.net.InetAddress
import java.net.InetSocketAddress
import org.apache.spark.storage.StorageLevel
import org.apache.spark.HashPartitioner

// Spark supports Flume version 1.6.0.

/* Add the following dependency to pom.xml:
 * <dependency>
 *   <groupId>org.apache.spark</groupId>
 *   <artifactId>spark-streaming-flume_2.11</artifactId>
 *   <version>2.1.0</version>
 * </dependency>
 */

/* Flume configuration:
a1.sources = r1
a1.sinks = k1
a1.channels = c1

a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir = /home/logs
a1.sources.r1.fileHeader = true

a1.sinks.k1.type=org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.k1.hostname=192.168.6.129
a1.sinks.k1.port = 8888

a1.channels.c1.type=memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity=100

a1.sources.r1.channels=c1
a1.sinks.k1.channel = c1

# the command to start an agent
# bin/flume-ng agent -n $agent_name -c conf -f conf/flume-conf.properties.template
*/

// The following three jars are also required; put them on Flume's classpath:
/*
groupId = org.apache.spark
artifactId = spark-streaming-flume-sink_2.11
version = 2.1.0

groupId = org.scala-lang
artifactId = scala-library
version = 2.11.7

groupId = org.apache.commons
artifactId = commons-lang3
version = 3.5
*/

object SparkStreamingFlumePoll {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("flume-poll").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint("d:\\checkpoint")

    val ipSeq = Seq(new InetSocketAddress("192.168.6.129", 8888))

    // In this mode Spark pulls the data from Flume.
    val rds = FlumeUtils.createPollingStream(ssc, ipSeq, StorageLevel.MEMORY_AND_DISK)

    val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => {
      //iter.flatMap(it=>Some(it._2.sum+it._3.getOrElse(0)).map((it._1,_)))   // variant 1
      //iter.flatMap{case(x,y,z)=>{Some(y.sum+z.getOrElse(0)).map((x,_))}}    // variant 2
      iter.flatMap(it => Some(it._1, (it._2.sum.toInt + it._3.getOrElse(0)))) // variant 3
    }

    val result = rds.flatMap(x => (new String(x.event.getBody.array())).split(" "))
      .map(x => (x, 1))
      .updateStateByKey(updateFunc, new HashPartitioner(sc.defaultMinPartitions), true)

    result.print()
    ssc.start()
    ssc.awaitTermination()
  }
}

// Error encountered: the scala-library jar is already present under Flume's lib directory,
// so shipping a second copy caused a conflict; delete one of the duplicates.
/*
18 Oct 2018 20:58:32,123 WARN [Spark Sink Processor Thread - 10] (org.apache.spark.streaming.flume.sink.Logging$class.logWarning:80) - Error while processing transaction.
java.lang.IllegalStateException: begin() called when transaction is OPEN!
    at com.google.common.base.Preconditions.checkState(Preconditions.java:145)
    at org.apache.flume.channel.BasicTransactionSemantics.begin(BasicTransactionSemantics.java:131)
    at org.apache.spark.streaming.flume.sink.TransactionProcessor$$anonfun$populateEvents$1.apply(TransactionProcessor.scala:114)
    at org.apache.spark.streaming.flume.sink.TransactionProcessor$$anonfun$populateEvents$1.apply(TransactionProcessor.scala:113)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.streaming.flume.sink.TransactionProcessor.populateEvents(TransactionProcessor.scala:113)
    at org.apache.spark.streaming.flume.sink.TransactionProcessor.call(TransactionProcessor.scala:243)
    at org.apache.spark.streaming.flume.sink.TransactionProcessor.call(TransactionProcessor.scala:43)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
18 Oct 2018 20:58:32,128 WARN [Spark Sink Processor Thread - 10] (org.apache.spark.streaming.flume.sink.Logging$class.logWarning:59) - Spark was unable to successfully process the events. Transaction is being rolled back.
18 Oct 2018 20:58:32,128 WARN [New I/O worker #1] (org.apache.spark.streaming.flume.sink.Logging$class.logWarning:59) - Received an error batch - no events were received from channel!
*/
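When chasing a sink-side problem like the jar conflict above, it can help to first confirm that the Spark Streaming job itself runs with Flume taken out of the picture. Below is a minimal sketch, assuming a plain per-batch word count is enough for the sanity check; queueStream stands in for the Flume stream, and the object name and test lines are made up for illustration.

package hgs.spark.streaming

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

// Hypothetical test job: a per-batch word count over hand-written lines,
// so the Spark Streaming side can be checked without a running Flume agent.
object SparkStreamingQueueTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("queue-test").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))

    // A queue of RDDs replaces the Flume input; the lines are made up test data.
    val rddQueue = new Queue[RDD[String]]()
    rddQueue += sc.makeRDD(Seq("hello spark", "hello flume"))

    val lines = ssc.queueStream(rddQueue)
    val result = lines.flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)

    result.print()
    ssc.start()
    ssc.awaitTermination()
  }
}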