[TOC]
A stream processing system based on Flume + Kafka + Spark Streaming
# flume
### Create the configuration file. The agent is named spark_kafka_agent, and it ingests data from /Users/zhangyaxing/eclipse-workspace/kafka_spark/spark_kafka.log
```properties
# spark_kafka_agent : name of the agent
spark_kafka_agent.sources = s1
spark_kafka_agent.sinks = k1
spark_kafka_agent.channels = c1

# Describe/configure the source
spark_kafka_agent.sources.s1.type = exec
spark_kafka_agent.sources.s1.command = tail -F /Users/zhangyaxing/eclipse-workspace/kafka_spark/spark_kafka.log

# Describe the sink
spark_kafka_agent.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
spark_kafka_agent.sinks.k1.topic = spark_kafka_topic
spark_kafka_agent.sinks.k1.brokerList = localhost:9092,localhost:9093,localhost:9094
spark_kafka_agent.sinks.k1.serializer.class = kafka.serializer.StringEncoder

# Use a channel which buffers events in memory
spark_kafka_agent.channels.c1.type = memory
spark_kafka_agent.channels.c1.capacity = 10000
spark_kafka_agent.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
spark_kafka_agent.sources.s1.channels = c1
spark_kafka_agent.sinks.k1.channel = c1
```
### Start Flume, connected to Kafka
```bash
flume-ng agent -n spark_kafka_agent -c conf/ -f conf/spark_kafka.properties -Dflume.root.logger=INFO,console
```
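For an end-to-end test, something has to keep appending lines to the tailed file. A minimal, illustrative generator (the path matches the exec source above; the object name and output format are made up for the sketch):
```scala
import java.io.{FileWriter, PrintWriter}

// Illustrative helper (not part of the original setup): appends one
// timestamped line per second to the log file that the Flume exec source
// tails, so events keep flowing through the pipeline to Kafka.
object LogGenerator {
  def main(args: Array[String]): Unit = {
    val path = "/Users/zhangyaxing/eclipse-workspace/kafka_spark/spark_kafka.log"
    while (true) {
      val out = new PrintWriter(new FileWriter(path, true)) // open in append mode
      out.println(s"log event at ${System.currentTimeMillis()}")
      out.close()
      Thread.sleep(1000)
    }
  }
}
```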
# kafka
### Configuration files
Only server.properties needs to change. Make two copies of config/server.properties so that there are three files: server.properties, server_1.properties, server_2.properties. Each copy needs its own broker.id, listener port, and log directory.
0. server.properties
```properties
broker.id=0
listeners=PLAINTEXT://localhost:9092
log.dirs=/tmp/kafka-logs-0
num.partitions=3
zookeeper.connect=localhost:2184,localhost:2182,localhost:2183
```
1. server_1.properties
```properties
broker.id=1
listeners=PLAINTEXT://localhost:9093
log.dirs=/tmp/kafka-logs-1
num.partitions=3
zookeeper.connect=localhost:2184,localhost:2182,localhost:2183
```
2. server_2.properties
```properties
broker.id=2
listeners=PLAINTEXT://localhost:9094
log.dirs=/tmp/kafka-logs-2
num.partitions=3
zookeeper.connect=localhost:2184,localhost:2182,localhost:2183
```
### Start the Kafka brokers
```bash
bin/kafka-server-start.sh -daemon config/server.properties
bin/kafka-server-start.sh -daemon config/server_1.properties
bin/kafka-server-start.sh -daemon config/server_2.properties
```
### Create the topic
```bash
bin/kafka-topics.sh --create --zookeeper localhost:2182 --replication-factor 3 --partitions 1 --topic spark_kafka_topic
```
### After creating it, use --describe to inspect the topic
```bash
bin/kafka-topics.sh --describe --zookeeper localhost:2182 --topic spark_kafka_topic
```
### Start a Kafka console consumer to receive the data from Flume
```bash
kafka-console-consumer.sh --zookeeper localhost:2182,localhost:2183,localhost:2184 --topic spark_kafka_topic --from-beginning
```
# kafka -----> spark streaming
### What is Spark Streaming
>Stream processing splits a continuous, unbounded input of data into unit-sized blocks and processes them one batch at a time.
Spark Streaming extends the Spark core API to support high-throughput, low-latency, scalable processing of streaming data.
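As a concrete illustration of this micro-batch model, here is a minimal word count that buckets a socket stream into 5-second batches; the host, port, and batch interval are arbitrary choices for the sketch:
```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Minimal sketch: each 5-second micro-batch of text lines is processed
// as an ordinary RDD (split into words, counted, printed).
object StreamingWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("StreamingWordCount")
    val ssc = new StreamingContext(conf, Seconds(5)) // batch interval = the unit block

    val lines = ssc.socketTextStream("localhost", 9999)
    val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    counts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```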
### Managing offsets ourselves
>To keep Spark Streaming from losing Kafka data, you can create a Kafka direct DStream and have Spark Streaming manage the offsets itself rather than storing them in ZooKeeper. Enabling Spark Streaming checkpoints is the simplest way to store offsets, since it comes built into the Spark framework: checkpoints save the application's state to HDFS so it can be restored after a failure, and on recovery the application reads messages starting from the checkpointed offset ranges. However, a Spark Streaming checkpoint cannot be restored once the application code has been modified, because deserializing the old checkpoint fails; it is therefore not very reliable, especially for critical production applications, and ZooKeeper-based offset visualization tools also stop working. We do not recommend managing offsets through Spark checkpoints. This article instead stores offsets in ZooKeeper by hand, keeping full control over them.
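The driver-side shape of the manual approach is: consume through the manager, process each batch, and only then write that batch's offsets back to ZooKeeper. A hedged sketch of that loop follows; the group id and batch interval are illustrative, and updateZKOffsets is a hypothetical helper name standing in for the ZooKeeper write (left commented out, since it is not part of the excerpt below):
```scala
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaManager

object DirectDriver {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("spark_kafka").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    val kafkaParams = Map(
      "metadata.broker.list" -> "localhost:9092,localhost:9093,localhost:9094",
      "group.id" -> "spark_kafka_group", // illustrative group id
      "auto.offset.reset" -> "smallest")
    val km = new KafkaManager(kafkaParams)

    // (key, message, topic) triples, per createDirectStream below
    val messages = km.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Set("spark_kafka_topic"))

    messages.foreachRDD { rdd =>
      if (!rdd.isEmpty()) rdd.map(_._2).foreach(println) // "process" the batch
      // Hypothetical helper: persist this batch's offset ranges to ZooKeeper
      // (e.g. via KafkaCluster.setConsumerOffsets) once processing succeeds.
      // km.updateZKOffsets(rdd)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
```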
### Writing the code
KafkaManager.scala
```scala
package org.apache.spark.streaming.kafka
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
import scala.reflect.ClassTag
class KafkaManager(val kafkaParams: Map[String, String]) extends Serializable {

  // KafkaCluster in Spark is private[spark]; the author uses a re-implemented
  // copy, which is why this file lives in package org.apache.spark.streaming.kafka
  private val kc = new KafkaCluster(kafkaParams)

  // Create a direct DStream starting from the offsets saved in ZooKeeper
  def createDirectStream[K: ClassTag,
                         V: ClassTag,
                         KD <: Decoder[K]: ClassTag,
                         VD <: Decoder[V]: ClassTag](
      ssc: StreamingContext,
      kafkaParams: Map[String, String],
      topics: Set[String]): InputDStream[(K, V, String)] = {
    val groupId = kafkaParams.get("group.id").get
    // Before reading offsets from ZooKeeper, bring them up to date if necessary
    setOrUpdateOffsets(topics, groupId)
    // Read the offsets from ZooKeeper and start consuming messages from there
    val messages = {
      val partitionsE = kc.getPartitions(topics)
      if (partitionsE.isLeft)
        // s"... ${...}" is Scala string interpolation
        throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
      val partitions = partitionsE.right.get
      val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
      if (consumerOffsetsE.isLeft)
        throw new SparkException(s"get kafka consumer offsets failed: ${consumerOffsetsE.left.get}")
      val consumerOffsets = consumerOffsetsE.right.get
      // Consume Kafka starting from the specified offsets.
      // messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key(), mmd.message())
      // MessageAndMetadata carries the message's topic, payload, and other metadata.
      KafkaUtils.createDirectStream[K, V, KD, VD, (K, V, String)](
        ssc, kafkaParams, consumerOffsets,
        (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message, mmd.topic))
    }
    messages
  }

  private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = {
    topics.foreach(topic => {
      var hasConsumed = true
      val partitionsE = kc.getPartitions(Set(topic))
      if (partitionsE.isLeft)
        throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
      val partitions = partitionsE.right.get
      val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
      // A group id consuming for the first time has no offset info in ZooKeeper,
      // so the lookup fails and consumption will start from the beginning
      if (consumerOffsetsE.isLeft) hasConsumed = false
      if (hasConsumed) { // the group has consumed before
        /**
         * If the streaming job throws kafka.common.OffsetOutOfRangeException,
         * the offsets saved in ZooKeeper are stale: Kafka's retention policy
         * has already deleted the log segments containing those offsets.
         * To handle this, compare the consumerOffsets in ZooKeeper with the
         * earliestLeaderOffsets; if consumerOffsets are smaller, they are
         * stale, so update consumerOffsets to earliestLeaderOffsets.
         */
        val earliestLeaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
        if (earliestLeaderOffsetsE.isLeft)
          throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsE.left.get}")
        val earliestLeaderOffsets = earliestLeaderOffsetsE.right.get
        val consumerOffsets = consumerOffsetsE.right.get
        // Possibly only some partitions' consumerOffsets are stale