一、需求
从指定时间戳(比如 2 小时前)开始消费 Kafka 数据
二、思路
我们知道通过 Kafka 的 API 可以得到指定时间戳对应数据所在的 segment 的起始 offset。那么就可以通过这个功能来粗略地实现需求。
三、实现
我们知道 KafkaUtils.createDirectStream 这个接口可以指定起始点的 offset,那么我们需要做的就变成如下三步:
- 获取 topic 对应的 TopicAndPartitions ,得到当前 topic 有多少 partition
- 从 Kafka 获取每个 partition 指定时间戳所在 segment 的起始 offset
- 将步骤 2 中的 offset 作为参数传入 createDirectStream 即可
package com.ruozedata.bigdata.spark.streaming01
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.joda.time.DateTime
import org.joda.time.format.{DateTimeFormat, DateTimeFormatter}
import scala.collection.JavaConverters._
import scala.collection.mutable
/**
 * Spark Streaming job that prints records from a Kafka topic, optionally starting from the
 * offsets that correspond to a user-supplied date/time instead of the consumer group's
 * default position.
 */
object SparkStreamingWithTimestamp {

  /** Joda-Time pattern the optional `datetime` argument must follow (24-hour clock). */
  private val DateTimePattern = "yyyy-MM-dd HH:mm:ss"

  /**
   * Entry point.
   *
   * @param args either empty (start from the position Kafka resolves for the group) or a
   *             single date/time string in the `yyyy-MM-dd HH:mm:ss` format
   */
  def main(args: Array[String]): Unit = {
    if (args.length > 1) {
      System.err.println(
        s"""
           |Usage: SparkStreamingWithTimestamp [datetime]
           | [datetime] is a kafka offset datetime.The format is $DateTimePattern
           |
           |""".stripMargin)
      System.exit(1)
    }

    val topicsSet = "test".split(",").toSet
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.174.120:9092",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.GROUP_ID_CONFIG -> "test",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest",
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
    )

    // Resolve the starting offsets before any Spark resources exist, so that a malformed
    // datetime or an unreachable broker fails fast.
    val consumerStrategy = args.headOption match {
      case Some(datetime) =>
        ConsumerStrategies.Subscribe[String, String](
          topicsSet, kafkaParams, getOffsetByTimestamp(kafkaParams, datetime, topicsSet))
      case None =>
        ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams)
    }

    // Kryo classes must be registered BEFORE the context is created: the context works on
    // its own copy of the configuration, so later changes to `conf` are not picked up.
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[*]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[ConsumerRecord[String, String]]))
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.sparkContext.setLogLevel("WARN")

    val messages: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc, LocationStrategies.PreferConsistent, consumerStrategy)

    messages.print()
    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * Looks up, for every partition of the given topics, the offset to start consuming from
   * for a given point in time.
   *
   * For each partition the result is the offset Kafka returns from `offsetsForTimes`. When
   * Kafka reports no match for a partition (a `null` entry, i.e. no record at or after the
   * requested time), the partition's current end offset is used instead so that nothing
   * older than the requested time is replayed.
   *
   * @param kafkaParams consumer configuration used to create the short-lived lookup consumer
   * @param time        date/time in the `yyyy-MM-dd HH:mm:ss` format
   * @param topics      topics whose partitions are resolved; defaults to `"test"` to stay
   *                    compatible with existing two-argument callers
   * @return mapping from topic partition to its starting offset
   * @throws IllegalArgumentException if `time` does not match the expected pattern
   */
  def getOffsetByTimestamp(
      kafkaParams: collection.Map[String, Object],
      time: String,
      topics: Iterable[String] = Seq("test")): mutable.HashMap[TopicPartition, Long] = {
    // Parse first: there is no point opening a consumer for an invalid timestamp.
    val fetchTime: java.lang.Long = DateTimeFormat.forPattern(DateTimePattern).parseMillis(time)

    val consumer =
      new KafkaConsumer[String, String](new java.util.HashMap[String, Object](kafkaParams.asJava))
    try {
      // (topic, partition) -> timestamp to search for
      val timestampToSearch = new java.util.HashMap[TopicPartition, java.lang.Long]()
      for {
        topic <- topics
        // partitionsFor may yield null for an unknown topic; treat that as "no partitions"
        partitionInfo <- Option(consumer.partitionsFor(topic)).toList.flatMap(_.asScala)
      } timestampToSearch.put(
        new TopicPartition(partitionInfo.topic(), partitionInfo.partition()), fetchTime)

      // partition -> resolved starting offset
      val partitionOffset = new mutable.HashMap[TopicPartition, Long]
      // partitions for which Kafka found no record at or after the requested time
      val withoutMatch = new java.util.ArrayList[TopicPartition]()

      for ((tp, offsetAndTimestamp) <- consumer.offsetsForTimes(timestampToSearch).asScala) {
        Option(offsetAndTimestamp) match {
          case Some(hit) => partitionOffset += tp -> hit.offset()
          case None      => withoutMatch.add(tp)
        }
      }

      if (!withoutMatch.isEmpty) {
        for ((tp, endOffset) <- consumer.endOffsets(withoutMatch).asScala) {
          partitionOffset += tp -> endOffset.longValue()
        }
      }

      partitionOffset
    } finally {
      // Always release the lookup consumer, even when a broker call fails.
      consumer.close()
    }
  }
}