val envi = "test1"
val Q_stream = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "Kafka.Server")
.option("subscribe", "topicname_${envi}")
.option("startingOffsets", "latest")
.option("failOnDataLoss", "false")
.option("kafka.group.id", "myConsumerGroup")
.load()
.toDF()
.select(col("key").cast("string").as("key"), col("value").cast("string"))
val value2Stream = Q_stream
.filter(col("key") === "AB")
.select(functions.from_json(col("value"), ABSchema).as("value"))
.select("value.*")
value2Stream.writeStream.format("orc")
.option("metastoreUri", "hive.warehouse.metastoreUri")
.option("checkpointLocation", "/tmp/strlocation_${envi}/AB/checkpointtest1")
.option("path", "/tmp/str2/AB")
.trigger(Trigger.ProcessingTime("5 Seconds"))
.partitionBy("jobid")
.start()
This code works fine, but it fails when we subscribe to another topic. The requirement is: when the job is submitted in the test1 environment, the Kafka source should subscribe to topicname_test1, and when it is submitted in the test2 environment, it should subscribe to topicname_test2.
I also tried the following, which does not work either.
var isQA: Boolean = false
if (args.length > 0) {
  isQA = args(0).toString.trim().toUpperCase == "Q"
  getEnvironment = args(1).toString.trim().toUpperCase == "TESTDEVC"
}
var env = ""
var envi = ""
if (isQA) {
  configManager.setup("test.conf")
  env = "QA"
  if (getEnvironment == "TESTDEVC") { // TODO: implement
    envi = "TESTDEVC"
  } else if (getEnvironment == "TESTDEVC1") {
    envi = "TESTDEVC1"
  } else if (getEnvironment == "TESTDEVC2") {
    envi = "TESTDEVC2"
  } else {
    throw new RuntimeException("unknown environment")
  }
}
val Q_stream = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "Kafka.Server")
.option("subscribe", "test_${envi}")
.option("startingOffsets", "latest")
.option("failOnDataLoss", "false")
.option("kafka.group.id", "myConsumerGroup")
.load()
.toDF()
.select(col("key").cast("string").as("key"), col("value").cast("string"))
val value2Stream = Q_stream
.filter(col("key") === "AB")
.select(functions.from_json(col("value"), ABSchema).as("value"))
.select("value.*")
value2Stream.writeStream.format("orc")
.option("metastoreUri", "hive.warehouse.metastoreUri")
.option("checkpointLocation", "/tmp/strlocation_${envi}/AB/checkpointtest1")
.option("path", "/tmp/str2/AB")
.trigger(Trigger.ProcessingTime("5 Seconds"))
.partitionBy("jobid")
.start()
The subscribe option can only be provided once, and that's not how you do string interpolation in Scala: you're missing the leading s.
It sounds like you just need to check your environment, which is a Scala question, not really related to Spark...
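To make the interpolation point concrete, here is a minimal illustration (the env value is assumed purely for the example):

val env = "test1"
"topicname_${env}"   // stays literally "topicname_${env}" – no substitution without the s prefix
s"topicname_${env}"  // becomes "topicname_test1"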
def main(args: Array[String]): Unit = {
  var env = ""
  if (getEnvironment() == "env1") {
    env = "test1"
  } else if (getEnvironment() == "env2") {
    env = "test2"
  } else {
    throw new RuntimeException("unknown environment")
  }
  val Q_stream = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "Kafka.Server")
    .option("subscribe", s"topicname_${env}")
    ...
}

def getEnvironment(): String = {
  ??? // TODO: implement
}
The same applies to the checkpointLocation path.
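For example, here is a sketch of the sink from the question with the same fix applied, using the env value from the snippet above (only the s prefix is new; the paths and options are unchanged):

value2Stream.writeStream.format("orc")
.option("metastoreUri", "hive.warehouse.metastoreUri")
.option("checkpointLocation", s"/tmp/strlocation_${env}/AB/checkpointtest1")
.option("path", "/tmp/str2/AB")
.trigger(Trigger.ProcessingTime("5 Seconds"))
.partitionBy("jobid")
.start()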
Alternatively, you can provide the environment or topic suffix as an argument to spark-submit - https://spark.apache.org/docs/latest/submitting-applications.html#launching-applications-with-spark-submit
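A sketch of that alternative, assuming the environment name is passed as the first program argument after the application jar (the object name and the argument position are illustrative, not from the question):

// spark-submit --class com.example.MyApp ... myapp.jar test1
object MyApp {
  def main(args: Array[String]): Unit = {
    val env = args.headOption.getOrElse(
      throw new IllegalArgumentException("expected an environment argument, e.g. test1 or test2"))
    // ... build the SparkSession and use s"topicname_${env}" and s"/tmp/strlocation_${env}/..." as above
  }
}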
I have a function kafkaIngestion which creates a df from a Kafka topic in the following way:
def kafkaIngestion(spark: SparkSession): DataFrame = {
  val df = spark.read.format("kafka")
    .option("kafka.bootstrap.servers", broker)
    .option("subscribe", topic)
    .option("group.id", grpid)
    .load()
    .selectExpr("cast(value as string) as data")
    .select(from_json($"data", schema = inputSchema).as("data"))
    .select("data.*")
  df
}
I am unable to mock the code to return my expected df. What's the correct way to mock the df?
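One approach that avoids mocking altogether (a suggestion of mine, not something from the post): keep the spark.read.format("kafka") call thin and move the parsing into a function that takes a plain DataFrame, so a test can feed it a hand-built DataFrame instead of a Kafka source. A sketch with hypothetical names (parseKafkaValue, the sample JSON payload); inputSchema is the schema from the question:

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{col, from_json}
import org.apache.spark.sql.types.StructType

// hypothetical refactor: the parsing logic is extracted so it can be unit tested without Kafka
def parseKafkaValue(raw: DataFrame, inputSchema: StructType): DataFrame =
  raw.selectExpr("cast(value as string) as data")
    .select(from_json(col("data"), inputSchema).as("data"))
    .select("data.*")

// in a test: build the input by hand instead of mocking the source
val spark = SparkSession.builder().master("local[1]").getOrCreate()
import spark.implicits._
val testInput = Seq("""{"field1":"a","field2":1}""").toDF("value") // hypothetical payload matching inputSchema
val result = parseKafkaValue(testInput, inputSchema)
// assert on result.collect() with the test framework of your choice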
I would like to apply OneHotEncoder to multiple columns in my streaming DataFrame, but I get the following error.
Any suggestions?
Many thanks!
Exception in thread "main" org.apache.spark.sql.AnalysisException: Queries with streaming sources
must be executed with writeStream.start();;
CODE:
// Read csv
val Stream = spark.read
.format("csv")
.option("header", "true")
.option("delimiter", ";")
.schema(DFschema)
.load("C:/[...]")
// Kafka
val properties = new Properties()
//val topic = "mongotest"
properties.put("bootstrap.servers", "localhost:9092")
properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
Stream.selectExpr("CAST(Col AS STRING) AS KEY",
"to_json(struct(*)) AS value")
.writeStream
.format("kafka")
.option("topic", "predict")
.option("kafka.bootstrap.servers", "localhost:9092")
.option("checkpointLocation", "C:[...]")
.start()
Subscribe to topic
val lines = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "predict")
.load()
val df = Stream
.selectExpr("CAST(value AS STRING)")
val jsons = df.select(from_json($"value", DFschema) as "data").select("data.*")
ETL[...]
Apply the Bucketizer() function to a field
val Msplits = Array(Double.NegativeInfinity,7, 14, 21, Double.PositiveInfinity)
val bucketizerM = new Bucketizer()
.setInputCol("MEASURE")
.setOutputCol("MEASURE_c")
.setSplits(Msplits)
val bucketedData1 = bucketizerD.transform(out)
val bucketedData2 = bucketizerM.transform(bucketedData1) // works
Error using OneHotEncoder()
val indexer = new StringIndexer()
.setInputCol("CODE")
.setOutputCol("CODE_index")
val encoder = new OneHotEncoder()
.setInputCol("CODE")
.setOutputCol("CODE_encoded")
val vectorAssembler = new VectorAssembler()
.setInputCols(Array("A","B", "CODE_encoded"))
.setOutputCol("features")
val transformationPipeline = new Pipeline()
.setStages(Array(indexer, encoder, vectorAssembler))
val fittedPipeline = transformationPipeline.fit(bucketedData2) // doesn't work
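The fit() call is what triggers the error: estimators such as StringIndexer have to scan the data, which cannot be done on a DataFrame with a streaming source. A common workaround (my suggestion, not part of the original post) is to fit the pipeline once on a static batch read of representative data and then apply the fitted model to the streaming side; transform() does work on streaming DataFrames. A sketch reusing the names from the question (the static read and the choice of DataFrame to transform are assumptions):

// fit on a static (batch) DataFrame with the same schema
val staticDF = spark.read
  .format("csv")
  .option("header", "true")
  .option("delimiter", ";")
  .schema(DFschema)
  .load("C:/[...]")

// note: OneHotEncoder normally takes the indexed column ("CODE_index"), not the raw string column
val fittedModel = transformationPipeline.fit(staticDF)

// apply the fitted model to the streaming DataFrame
val scored = fittedModel.transform(bucketedData2)

scored.writeStream
  .format("console")
  .start()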
We have two InputDStreams from two Kafka topics, and we have to join the data of these two inputs together.
The problem is that each InputDStream is processed independently; because of foreachRDD, nothing can be returned to join afterwards.
var Message1ListBuffer = new ListBuffer[Message1]
var Message2ListBuffer = new ListBuffer[Message2]

inputDStream1.foreachRDD(rdd => {
  if (!rdd.partitions.isEmpty) {
    val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    rdd.map({ msg =>
      val r = msg.value()
      val avro = AvroUtils.objectToAvro(r.getSchema, r)
      val messageValue = AvroInputStream.json[FMessage1](avro.getBytes("UTF-8")).singleEntity.get
      Message1ListBuffer = Message1FlatMapper.flatmap(messageValue)
      Message1ListBuffer
    })
    inputDStream1.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
  }
})

inputDStream2.foreachRDD(rdd => {
  if (!rdd.partitions.isEmpty) {
    val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    rdd.map({ msg =>
      val r = msg.value()
      val avro = AvroUtils.objectToAvro(r.getSchema, r)
      val messageValue = AvroInputStream.json[FMessage2](avro.getBytes("UTF-8")).singleEntity.get
      Message2ListBuffer = Message1FlatMapper.flatmap(messageValue)
      Message2ListBuffer
    })
    inputDStream2.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
  }
})
I thought I could return Message1ListBuffer and Message2ListBuffer, turn them into DataFrames, and join them, but that does not work, and I do not think it's the best choice anyway.
From there, what is the way to return the RDD from each foreachRDD in order to do the join?
inputDStream1.foreachRDD(rdd => {
})
inputDStream2.foreachRDD(rdd => {
})
I'm not sure about the Spark version you are using; with Spark 2.3+ this can be achieved directly.
With Spark >= 2.3
Subscribe to the 2 topics you want to join
val ds1 = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "brokerhost1:port1,brokerhost2:port2")
.option("subscribe", "source-topic1")
.option("startingOffsets", "earliest")
.option("endingOffsets", "latest")
.load
val ds2 = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "brokerhost1:port1,brokerhost2:port2")
.option("subscribe", "source-topic2")
.option("startingOffsets", "earliest")
.option("endingOffsets", "latest")
.load
Format the subscribed messages in both streams
val stream1 = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
.as[(String, String)]
val stream2 = ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
.as[(String, String)]
Join both the streams
val resultStream = stream1.join(stream2)
// more join operations here
Warning:
Late records will not get a join match; you need to tweak the buffering a bit. More information can be found here.
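The buffering mentioned above is controlled with event-time watermarks plus a time-range join condition, as described in the Structured Streaming guide. A minimal sketch (the parsed stream names impressions/clicks and the columns adId/impressionTime/clickTime are assumptions, not columns from the question):

import org.apache.spark.sql.functions.expr

// assumed: both streams have been parsed into columns that include an event-time field
val impressionsWithWatermark = impressions.withWatermark("impressionTime", "2 hours")
val clicksWithWatermark = clicks.withWatermark("clickTime", "3 hours")

// inner join with a time-range condition; records arriving later than the watermark allows are dropped
val resultStream = impressionsWithWatermark.join(
  clicksWithWatermark,
  expr("""
    clickAdId = impressionAdId AND
    clickTime >= impressionTime AND
    clickTime <= impressionTime + interval 1 hour
  """)
)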
kafka: kafka_2.11-0.10.2.1
scala:2.12
val TOPIC_EVENT_XXX = "EVENT.xxx.ALL"
import org.apache.spark.sql.Encoders
val schema = Encoders.bean(classOf[Event]).schema
val allEventsDF = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "Streaming01.simon.com:9090,Streaming02.simon.com:9090,Streaming03.simon.com:9090,Streaming04.simon.com:9090")
.option("subscribe", TOPIC_EVENT_XXX)
.option("startingOffsets", "latest")
.option("maxOffsetsPerTrigger", 5000)//old,1000000
.load()
.select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
.selectExpr("parsed_value.*")
val KAFKA_BOOTSTRAP_SERVERS = "Streaming01.simon.com:9090,Streaming02.simon.com:9090,Streaming03.simon.com:9090,Streaming04.simon.com:9090,Ingest01.simon.com:9090,Ingest02.simon.com:9090,Notify01.simon.com:9090,Notify02.simon.com:9090,Serving01.simon.com:9090,Serving02.simon.com:9090,"
var waybillStatesKafkaSinkQuery = waybillStates.selectExpr("to_json(struct(*)) AS value")
.writeStream
.outputMode("append")
.format("kafka") // can be "orc", "json", "csv",memory,console etc.
.option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
.option("topic", TOPIC_TIMECHAIN_WAYBILL) //TIMECHAIN.WAYBILL.ALL //TIMECHAIN.WAYBILL.TL //TOPIC_TIMECHAIN_WAYBILL
.option("checkpointLocation", CHECKPOINT_PATH_TL_EVENT_WAYBILL_STATES)
.option("kafka.max.request.size", "164217728")//134217728//209715200
.option("kafka.buffer.memory", "164217728")
.option("kafka.timeout.ms",180000)
.option("kafka.request.timeout.ms",180000)
.option("kafka.session.timeout.ms",180000)
.option("kafka.heartbeat.interval.ms",120000)
.option("kafka.retries",100)
.option("failOnDataLoss","false")//后添加的【2018-07-11】
.start()
The following error occurred while running the above program:
org.apache.kafka.common.errors.NetworkException: The server disconnected before a response was received.
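One thing worth checking (an assumption on my part, not a confirmed diagnosis): kafka.max.request.size is set to roughly 156 MB here, far above the usual broker defaults (message.max.bytes is about 1 MB, socket.request.max.bytes is 100 MB), and a produce request larger than the broker accepts can end with the broker closing the connection, which the client then reports as this NetworkException. A sketch of keeping the producer limit within the broker limits (10485760 is only an example value):

// example values only – kafka.max.request.size must stay at or below the broker-side
// message.max.bytes / socket.request.max.bytes limits
var waybillStatesKafkaSinkQuery = waybillStates.selectExpr("to_json(struct(*)) AS value")
  .writeStream
  .outputMode("append")
  .format("kafka")
  .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
  .option("topic", TOPIC_TIMECHAIN_WAYBILL)
  .option("checkpointLocation", CHECKPOINT_PATH_TL_EVENT_WAYBILL_STATES)
  .option("kafka.max.request.size", "10485760") // ~10 MB instead of ~156 MB
  .option("failOnDataLoss", "false")
  .start()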
I have the following code to read and process Kafka data using Structured Streaming
object ETLTest {

  case class record(value: String, topic: String)

  def main(args: Array[String]): Unit = {
    run()
  }

  def run(): Unit = {
    val spark = SparkSession
      .builder
      .appName("Test JOB")
      .master("local[*]")
      .getOrCreate()

    val kafkaStreamingDF = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "...")
      .option("subscribe", "...")
      .option("failOnDataLoss", "false")
      .option("startingOffsets", "earliest")
      .load()
      .selectExpr("CAST(value as STRING)", "CAST(timestamp as STRING)", "CAST(topic as STRING)")

    val sdvWriter = new ForeachWriter[record] {
      def open(partitionId: Long, version: Long): Boolean = {
        true
      }
      def process(record: record) = {
        println("record:: " + record)
      }
      def close(errorOrNull: Throwable): Unit = {}
    }

    val sdvDF = kafkaStreamingDF
      .as[record]
      .filter($"value".isNotNull)

    // DOES NOT WORK
    /*val query = sdvDF
      .writeStream
      .format("console")
      .start()
      .awaitTermination()*/

    // WORKS
    /*val query = sdvDF
      .writeStream
      .foreach(sdvWriter)
      .start()
      .awaitTermination()
    */
  }
}
I am running this code from the IntelliJ IDEA IDE. When I use foreach(sdvWriter) I can see the records consumed from Kafka, but when I use .writeStream.format("console") I do not see any records. I assume the console write stream is maintaining some sort of checkpoint and assumes it has processed all the records. Is that the case? Am I missing something obvious here?
I reproduced your code here, and both of the options worked. Actually, in both options it would fail without
import spark.implicits._
so I'm not sure what you are missing. It might be that some dependencies are not configured correctly. Can you add the pom.xml?
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.streaming.Trigger
object Check {

  case class record(value: String, topic: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder().master("local[2]")
      .getOrCreate

    import spark.implicits._

    val kafkaStreamingDF = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "test")
      .option("startingOffsets", "earliest")
      .option("failOnDataLoss", "false")
      .load()
      .selectExpr("CAST(value as STRING)", "CAST(timestamp as STRING)", "CAST(topic as STRING)")

    val sdvDF = kafkaStreamingDF
      .as[record]
      .filter($"value".isNotNull)

    val query = sdvDF.writeStream
      .format("console")
      .option("truncate", "false")
      .start()
      .awaitTermination()
  }
}
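Regarding the dependency question: the Kafka source behind readStream.format("kafka") lives in the spark-sql-kafka-0-10 artifact, which has to be on the classpath alongside spark-sql. A minimal sbt sketch (version numbers are examples and must match your Spark and Scala versions; the Maven coordinates are the same):

// build.sbt – example versions only
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.4.5",
  "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.5"
)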