1.概述
在《Kafka實戰-Flume到Kafka》一文中給大家分享了Kafka的資料來源生產,今天為大家介紹如何去實時消費Kafka中的資料。這裡使用實時計算的模型——Storm。下面是今天分享的主要內容,如下所示:
- 資料消費
- Storm計算
- 預覽截圖
接下來,我們開始分享今天的內容。
2.資料消費
Kafka的資料消費,是由Storm去消費,通過KafkaSpout將資料輸送到Storm,然後讓Storm按照業務需求對接收的資料做實時處理,下面給大家介紹資料消費的流程圖,如下圖所示:
從圖可以看出,Storm通過KafkaSpout獲取Kafka叢集中的資料,在經過Storm處理後,結果會被持久化到DB庫中。
3.Storm計算
接著,我們使用Storm去計算,這裡需要提前搭建部署好Storm叢集,若是未搭建部署叢集,大家可以參考我寫的《Kafka實戰-Storm Cluster》。這裡就不多做贅述搭建的過程了,下面給大家介紹實現這部分的程式碼,關於KafkaSpout的程式碼如下所示:
- KafkaSpout類:
package cn.hadoop.hdfs.storm;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.hadoop.hdfs.conf.ConfigureAPI.KafkaProperties;
import kafka.consumer.Consumer;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

/**
 * Storm spout that feeds tuples from a Kafka topic (via the Kafka 0.8
 * high-level consumer) into a topology. Emits one field, "KafkaSpout",
 * containing the raw message payload decoded as a String.
 *
 * Fix over the original version: consumption used to happen in an infinite
 * {@code while} loop inside {@link #activate()}, which blocks the spout's
 * executor thread forever and prevents Storm from ever invoking
 * {@link #nextTuple()}, {@link #ack(Object)} or {@link #fail(Object)}.
 * The iterator is now kept as a field and exactly one message is emitted
 * per {@code nextTuple()} call, as the ISpout contract requires.
 *
 * @Date Jun 10, 2015
 * @Author dengjie
 * @Note Data sources using KafkaSpout to consume Kafka
 */
public class KafkaSpout implements IRichSpout {

    private static final long serialVersionUID = -7107773519958260350L;
    private static final Logger LOGGER = LoggerFactory.getLogger(KafkaSpout.class);

    SpoutOutputCollector collector;
    private ConsumerConnector consumer;
    private String topic;
    // Iterator over the single Kafka stream; created in activate(), drained in nextTuple().
    private transient ConsumerIterator<byte[], byte[]> iterator;

    /**
     * Builds the high-level consumer configuration from the project-wide
     * Kafka/ZooKeeper properties.
     */
    private static ConsumerConfig createConsumerConfig() {
        Properties props = new Properties();
        props.put("zookeeper.connect", KafkaProperties.ZK);
        props.put("group.id", KafkaProperties.GROUP_ID);
        props.put("zookeeper.session.timeout.ms", "40000");
        props.put("zookeeper.sync.time.ms", "200");
        props.put("auto.commit.interval.ms", "1000");
        return new ConsumerConfig(props);
    }

    /**
     * @param topic Kafka topic to consume from.
     */
    public KafkaSpout(String topic) {
        this.topic = topic;
    }

    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
    }

    /**
     * Releases the consumer's ZooKeeper session and fetcher threads.
     * The original implementation leaked them by never shutting down.
     */
    public void close() {
        if (this.consumer != null) {
            this.consumer.shutdown();
            this.consumer = null;
        }
        this.iterator = null;
    }

    /**
     * Connects to Kafka and prepares the message iterator. Must return
     * promptly; actual consumption happens in {@link #nextTuple()}.
     */
    public void activate() {
        this.consumer = Consumer.createJavaConsumerConnector(createConsumerConfig());
        Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
        topicCountMap.put(topic, Integer.valueOf(1)); // one consumer stream for this topic
        Map<String, List<KafkaStream<byte[], byte[]>>> streamMap = consumer.createMessageStreams(topicCountMap);
        KafkaStream<byte[], byte[]> stream = streamMap.get(topic).get(0);
        this.iterator = stream.iterator();
    }

    public void deactivate() {
        // No-op: the iterator is simply left idle until activate()/close().
    }

    /**
     * Emits at most one Kafka message per invocation so Storm can interleave
     * ack/fail processing. NOTE(review): with the default consumer setting
     * (consumer.timeout.ms = -1) hasNext() blocks until a message arrives;
     * configure a timeout if a fully non-blocking spout is required.
     */
    public void nextTuple() {
        if (iterator != null && iterator.hasNext()) {
            String value = new String(iterator.next().message());
            LOGGER.info("(consumer)==>" + value);
            // The message payload doubles as the tuple's message id for anchoring.
            collector.emit(new Values(value), value);
        }
    }

    public void ack(Object msgId) {
        // TODO Auto-generated method stub
    }

    public void fail(Object msgId) {
        // TODO Auto-generated method stub
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("KafkaSpout"));
    }

    public Map<String, Object> getComponentConfiguration() {
        // TODO Auto-generated method stub
        return null;
    }
}
- KafkaTopology類:
package cn.hadoop.hdfs.storm.client; import cn.hadoop.hdfs.storm.FileBlots; import cn.hadoop.hdfs.storm.KafkaSpout; import cn.hadoop.hdfs.storm.WordsCounterBlots; import backtype.storm.Config; import backtype.storm.LocalCluster; import backtype.storm.StormSubmitter; import backtype.storm.topology.TopologyBuilder; import backtype.storm.tuple.Fields; /** * @Date Jun 10, 2015 * * @Author dengjie * * @Note KafkaTopology Task */ public class KafkaTopology { public static void main(String[] args) { TopologyBuilder builder = new TopologyBuilder(); builder.setSpout("testGroup", new KafkaSpout("test")); builder.setBolt("file-blots", new FileBlots()).shuffleGrouping("testGroup"); builder.setBolt("words-counter", new WordsCounterBlots(), 2).fieldsGrouping("file-blots", new Fields("words")); Config config = new Config(); config.setDebug(true); if (args != null && args.length > 0) { // online commit Topology config.put(Config.NIMBUS_HOST, args[0]); config.setNumWorkers(3); try { StormSubmitter.submitTopologyWithProgressBar(KafkaTopology.class.getSimpleName(), config, builder.createTopology()); } catch (Exception e) { e.printStackTrace(); } } else { // Local commit jar LocalCluster local = new LocalCluster(); local.submitTopology("counter", config, builder.createTopology()); try { Thread.sleep(60000); } catch (InterruptedException e) { e.printStackTrace(); } local.shutdown(); } } }
4.預覽截圖
首先,我們啟動Kafka叢集,目前未生產任何訊息,如下圖所示:
接下來,我們啟動Flume叢集,開始收集日誌資訊,將資料輸送到Kafka叢集,如下圖所示:
接下來,我們啟動Storm UI來檢視Storm提交的任務執行狀況,如下圖所示:
最後,將統計的結果持久化到Redis或者MySQL等DB中,結果如下圖所示:
5.總結
這裡給大家分享了資料的消費流程,並且給出了持久化的結果預覽圖,關於持久化的細節,後面會有單獨一篇部落格詳細講述,給大家分享其中的過程,這裡大家熟悉下流程,預覽結果即可。
6.結束語
這篇部落格就和大家分享到這裡,如果大家在研究學習的過程當中有什麼問題,可以加群進行討論或傳送郵件給我,我會盡我所能為您解答,與君共勉!