spark 與flume 1.6.0

hgs19921112發表於2018-10-18
package hgs.spark.streaming
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.storage.StorageLevel
import org.apache.spark.HashPartitioner
/*	pom.xml中加入如下配置
 * 	<dependency>
 
    		<groupId>org.apache.spark</groupId>
    		<artifactId>spark-streaming-flume_2.11</artifactId>
   			<version>2.1.0</version>
		</dependency>
		* 
		*/
/*flume的conf檔案
a1.sources=r1
a1.sinks=k1
a1.channels=c1
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/home/logs
a1.sources.r1.fileHeader=true
a1.sinks.k1.type=avro
a1.sinks.k1.hostname= 192.168.1.9
a1.sinks.k1.port= 8888
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
#the command to start a agent
#bin/flume-ng agent -n $agent_name -c conf -f conf/flume-conf.properties.template
*/
object SparkStreamingFlumePush {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("flume-push").setMaster("local[2]");
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc,Seconds(5))
    ssc.checkpoint("d:\\checkpoint")
    val updateFunc=(iter:Iterator[(String,Seq[Int],Option[Int])])=>{
    //iter.flatMap(it=>Some(it._2.sum+it._3.getOrElse(0)).map((it._1,_)))//方式一
    //iter.flatMap{case(x,y,z)=>{Some(y.sum+z.getOrElse(0)).map((x,_))}}//方式二
    iter.flatMap(it=>Some(it._1,(it._2.sum.toInt+it._3.getOrElse(0))))//方式三
    }
    //總共有兩種獲取資料的方式,push和poll,這種是push即flume將資料推送給spark 該出的ip、port是spark的ip地址和port
    val rds = FlumeUtils.createStream(ssc, "192.168.1.9", 8888, StorageLevel.MEMORY_ONLY)
    val result = rds.flatMap(x=>(new String(x.event.getBody.array())).split(" "))
    .map(x=>(x,1))
    .updateStateByKey(updateFunc, new HashPartitioner(sc.defaultMinPartitions), true)
    
    result.print()
    
    ssc.start()
    ssc.awaitTermination()
    
    
  }
}
package hgs.spark.streaming
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.flume.FlumeUtils
import java.net.InetAddress
import java.net.InetSocketAddress
import org.apache.spark.storage.StorageLevel
import org.apache.spark.HashPartitioner
//spark支援1.6.0的flume版本
/*	pom.xml中加入如下配置
 * 	<dependency>
 
    		<groupId>org.apache.spark</groupId>
    		<artifactId>spark-streaming-flume_2.11</artifactId>
   			<version>2.1.0</version>
		</dependency>
		* 
		*/
/*
 * flume配置
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir = /home/logs
a1.sources.r1.fileHeader = true
a1.sinks.k1.type=org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.k1.hostname=192.168.6.129
a1.sinks.k1.port = 8888
a1.channels.c1.type=memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity=100
a1.sources.r1.channels=c1
a1.sinks.k1.channel = c1
#the command to start a agent
#bin/flume-ng agent -n $agent_name -c conf -f conf/flume-conf.properties.template
*/
//同時需要如下三個包 將三個包放到flume的classpath下面
/* groupId = org.apache.spark
 artifactId = spark-streaming-flume-sink_2.11
 version = 2.1.0
 
 groupId = org.scala-lang
 artifactId = scala-library
 version = 2.11.7
 
 groupId = org.apache.commons
 artifactId = commons-lang3
 version = 3.5*/
 
object SparkStreamingFlumePoll {
  def main(args: Array[String]): Unit = {
   val conf = new SparkConf().setAppName("flume-push").setMaster("local[2]");
   val sc = new SparkContext(conf)
   val ssc = new StreamingContext(sc,Seconds(5))
   ssc.checkpoint("d:\\checkpoint")
   val ipSeq =  Seq(new InetSocketAddress("192.168.6.129",8888))
   //這種方式透過spark從flume拉取資料
   val rds = FlumeUtils.createPollingStream(ssc, ipSeq, StorageLevel.MEMORY_AND_DISK)
   
   val updateFunc=(iter:Iterator[(String,Seq[Int],Option[Int])])=>{
    //iter.flatMap(it=>Some(it._2.sum+it._3.getOrElse(0)).map((it._1,_)))//方式一
    //iter.flatMap{case(x,y,z)=>{Some(y.sum+z.getOrElse(0)).map((x,_))}}//方式二
    iter.flatMap(it=>Some(it._1,(it._2.sum.toInt+it._3.getOrElse(0))))//方式三
    }
   val result = rds.flatMap(x=>(new String(x.event.getBody.array())).split(" "))
    .map(x=>(x,1))
    .updateStateByKey(updateFunc, new HashPartitioner(sc.defaultMinPartitions), true)
    
    result.print()
    
    ssc.start()
    ssc.awaitTermination()
  }
}
//遇到的錯誤 scala-library包在flume 的lib下面本來就有,包重複導致的衝突,刪除一個
/*18 Oct 2018 20:58:32,123 WARN  [Spark Sink Processor Thread - 10] (org.apache.spark.streaming.flume.sink.Logging$class.logWarning:80)  - Error while processing transaction.
java.lang.IllegalStateException: begin() called when transaction is OPEN!
	at com.google.common.base.Preconditions.checkState(Preconditions.java:145)
	at org.apache.flume.channel.BasicTransactionSemantics.begin(BasicTransactionSemantics.java:131)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor$$anonfun$populateEvents$1.apply(TransactionProcessor.scala:114)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor$$anonfun$populateEvents$1.apply(TransactionProcessor.scala:113)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor.populateEvents(TransactionProcessor.scala:113)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor.call(TransactionProcessor.scala:243)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor.call(TransactionProcessor.scala:43)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
18 Oct 2018 20:58:32,128 WARN  [Spark Sink Processor Thread - 10] (org.apache.spark.streaming.flume.sink.Logging$class.logWarning:59)  - Spark was unable to successfully process the events. Transaction is being rolled back.
18 Oct 2018 20:58:32,128 WARN  [New I/O  worker #1] (org.apache.spark.streaming.flume.sink.Logging$class.logWarning:59)  - Received an error batch - no events were received from channel! */


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/31506529/viewspace-2216840/,如需轉載,請註明出處,否則將追究法律責任。

相關文章