Empty RDD while consuming messages from RabbitMQ in Spark Streaming (Scala)

Below is the Spark Streaming code that consumes messages from RabbitMQ.
import java.io.{ BufferedReader, ByteArrayInputStream, InputStreamReader }
import java.net.Socket
import java.nio.charset.StandardCharsets
import java.util.zip.GZIPInputStream
import com.rabbitmq.client.{ Channel, Connection, ConnectionFactory, QueueingConsumer }
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.receiver.Receiver
import scala.collection.JavaConverters._

class CustomReceiver extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  var factory: ConnectionFactory = _
  var consumer: QueueingConsumer = _
  var rabbitMQconnection: Connection = null
  var channel: Channel = null
  var host = "****"
  var port = "****"
  var queueName = "test"
  var virtualHost = "host"
  var userName = "name"
  var password = "****"

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  /** Create a RabbitMQ connection and receive data until the receiver is stopped */
  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      val QUEUE_NAME = "test"
      if (rabbitMQconnection == null) {
        rabbitMQconnection = getRabbitMQConnection
        channel = rabbitMQconnection.createChannel()
        println("rabbitMQ Connection created")
        if (rabbitMQconnection != null) {
          println("rabbitMQ Connection created")
        }
      }
      // Declaring the queue with arguments that do not match the existing queue
      // (e.g. a different x-message-ttl) fails the declaration.
      val delayArgs = Map[String, AnyRef]("x-message-ttl" -> Long.box(50000))
      channel.queueDeclare("test", true, false, false, delayArgs.asJava)
      consumer = new QueueingConsumer(channel)
      channel.basicConsume("test", true, consumer) // autoAck = true
      // Consumes a single delivery; the receiver thread then exits.
      val delivery = consumer.nextDelivery()
      val ty = unzip(delivery.getBody())
      println("at consumer : " + ty)
      store(ty)
      // note: with autoAck = true the broker has already acknowledged this delivery
      channel.basicAck(delivery.getEnvelope().getDeliveryTag(), false)
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to ")
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }

  def getRabbitMQConnection() = {
    val factory = new ConnectionFactory()
    factory.setUsername("name")
    factory.setPassword("password")
    factory.setHost("****")
    factory.setVirtualHost("*****")
    factory.setAutomaticRecoveryEnabled(true)
    factory.setTopologyRecoveryEnabled(true)
    factory.newConnection()
  }

  def unzip(x: Array[Byte]): String = {
    val inputStream = new GZIPInputStream(new ByteArrayInputStream(x))
    scala.io.Source.fromInputStream(inputStream).mkString
  }
}
import org.apache.spark._
import org.apache.spark.storage._
import org.apache.spark.streaming._
import org.apache.spark.streaming.receiver._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
val batchInterval = Seconds(10)
val ssc = new StreamingContext(sc, batchInterval)
val stream = ssc.receiverStream(new CustomReceiver())
stream.foreachRDD { rdd =>
  print(rdd) // prints the RDD reference; use rdd.collect().foreach(println) to see the records
}
ssc.start()
ssc.awaitTermination()
This gives me empty output, as shown below. When I tried consuming messages with the Python pika library (without the Spark streaming context), I was able to consume the data and save the messages to storage.
-------------------------------------------
Time: 1642473060000 ms
-------------------------------------------

-------------------------------------------
Time: 1642473070000 ms
-------------------------------------------
Could someone please let me know what's wrong in the above code?

Changing the x-message-ttl setting to the desired value resolved the issue. The x-message-ttl value must be the same as the value with which the queue was declared in RabbitMQ.
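For illustration, a minimal sketch of a declaration that matches the broker (the 60000 ms TTL, the masked host, and the queue name are assumptions; substitute whatever the existing queue was actually declared with). Re-declaring an existing queue with different arguments typically fails with a channel-level PRECONDITION_FAILED error, which the receiver above swallows in its catch block and restarts, so all you see is empty batches.

import scala.collection.JavaConverters._
import com.rabbitmq.client.ConnectionFactory

// Placeholder connection details, as in the question.
val factory = new ConnectionFactory()
factory.setHost("****")
val connection = factory.newConnection()
val channel = connection.createChannel()

// The declared arguments must match the existing queue exactly; a mismatched
// x-message-ttl fails the declaration, so basicConsume is never reached.
val queueArgs = Map[String, AnyRef]("x-message-ttl" -> Long.box(60000))
channel.queueDeclare("test", true, false, false, queueArgs.asJava)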

Related

How to make a source function that polls an HTTP endpoint into a Flink stream every 1 hour?

I am trying to build a source that polls an HTTP endpoint every hour and keeps it as a Flink source to broadcast to operators.
I tried to write it as a simple source function, but it does not seem to work as expected.
The code is:
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext
import org.apache.http.{HttpRequest, HttpResponse}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.bootstrap.{HttpServer, ServerBootstrap}
import org.apache.http.protocol.{HttpContext, HttpRequestHandler}
import java.util.concurrent.TimeUnit
class HttpStreamFun(url: String) extends SourceFunction[String] {

  @transient private var server: HttpServer = _

  override def run(ctx: SourceContext[String]): Unit = {
    server = ServerBootstrap
      .bootstrap()
      .registerHandler(
        url,
        new HttpRequestHandler() {
          override def handle(req: HttpRequest,
                              rep: HttpResponse,
                              context: HttpContext): Unit = {
            ctx.collect(req.getRequestLine.getUri)
            rep.setStatusCode(200)
            rep.setEntity(new StringEntity("OK"))
          }
        }
      )
      .create()
    server.start()
    server.awaitTermination(1, TimeUnit.HOURS)
  }

  override def cancel(): Unit = {
    server.stop()
  }
}
The main job adds the source as a DataStream:
val text: DataStream[String] = env.addSource(new HttpStreamFun(config.baseUri))
text.print()
Maybe you can try to use a BroadcastStream.
For more information, refer to
https://ci.apache.org/projects/flink/flink-docs-master/docs/dev/datastream/fault-tolerance/broadcast_state/
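A minimal sketch of wiring the polled values into broadcast state, assuming a hypothetical main stream events: DataStream[String] that should always see the latest response from the endpoint; the descriptor name and the "latest" key are illustrative, not part of the original code:

import org.apache.flink.api.common.state.MapStateDescriptor
import org.apache.flink.api.common.typeinfo.BasicTypeInfo
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

val configDescriptor = new MapStateDescriptor[String, String](
  "httpConfig", BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO)

// `text` is the DataStream[String] produced by HttpStreamFun above
val broadcastConfig = text.broadcast(configDescriptor)

val enriched: DataStream[String] = events
  .connect(broadcastConfig)
  .process(new BroadcastProcessFunction[String, String, String] {
    // regular elements read the latest broadcast value
    override def processElement(value: String,
                                ctx: BroadcastProcessFunction[String, String, String]#ReadOnlyContext,
                                out: Collector[String]): Unit = {
      val cfg = Option(ctx.getBroadcastState(configDescriptor).get("latest")).getOrElse("")
      out.collect(s"$value / $cfg")
    }

    // each HTTP poll result updates the broadcast state on every parallel instance
    override def processBroadcastElement(value: String,
                                         ctx: BroadcastProcessFunction[String, String, String]#Context,
                                         out: Collector[String]): Unit = {
      ctx.getBroadcastState(configDescriptor).put("latest", value)
    }
  })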

How to properly test an akka stream?

I am using a WebSocket client and would like to test against the received message. I've chosen the ScalaTest framework, and I know that the test has to be carried out asynchronously.
The websocket client looks as the following:
import akka.{Done}
import akka.http.scaladsl.Http
import akka.stream.scaladsl._
import akka.http.scaladsl.model.ws._
import io.circe.syntax._
import scala.concurrent.Future
object WsClient {
  import Trigger._

  private val convertJson: PreMsg => String = msg =>
    msg.asJson.noSpaces

  val send: PreMsg => (String => Unit) => RunnableGraph[Future[Done]] = msg => fn =>
    Source.single(convertJson(msg))
      .map(TextMessage(_))
      .via(Http().webSocketClientFlow(WebSocketRequest(s"ws://${Config.host}:${Config.port}/saprs")))
      .map(_.asTextMessage.getStrictText)
      .toMat(Sink.foreach(fn))(Keep.right)
}
and the test:
feature("Process incoming messages") {
info("As a user, I want that incoming messages is going to process appropriately.")
info("A message should contain the following properties: `sap_id`, `sap_event`, `payload`")
scenario("Message is not intended for the server") {
Given("A message with `sap_id:unknown`")
val msg = PreMsg("unknown", "unvalid", "{}")
When("the message gets validated")
val ws = WsClient.send(msg)
Then("it should has the `status: REJECT` in the response content")
ws { msg =>
//Would like test against the msg here
}.run()
.map(_ => assert(1 == 1))
}
I would like to test against the content of msg, but I do not know how to do it.
I followed the play-scala-websocket-example
They use a WebSocketClient as a helper, see WebSocketClient.java
Then a test looks like:
Helpers.running(TestServer(port, app)) {
val myPublicAddress = s"localhost:$port"
val serverURL = s"ws://$myPublicAddress/ws"
val asyncHttpClient: AsyncHttpClient = client.underlying[AsyncHttpClient]
val webSocketClient = new WebSocketClient(asyncHttpClient)
val queue = new ArrayBlockingQueue[String](10)
val origin = serverURL
val consumer: Consumer[String] = new Consumer[String] {
override def accept(message: String): Unit = queue.put(message)
}
val listener = new WebSocketClient.LoggingListener(consumer)
val completionStage = webSocketClient.call(serverURL, origin, listener)
val f = FutureConverters.toScala(completionStage)
// Test we can get good output from the websocket
whenReady(f, timeout = Timeout(1.second)) { webSocket =>
val condition: Callable[java.lang.Boolean] = new Callable[java.lang.Boolean] {
override def call(): java.lang.Boolean = webSocket.isOpen && queue.peek() != null
}
await().until(condition)
val input: String = queue.take()
val json:JsValue = Json.parse(input)
val symbol = (json \ "symbol").as[String]
List(symbol) must contain oneOf("AAPL", "GOOG", "ORCL")
}
}
}
See here: FunctionalSpec.scala
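Another option, sketched below in plain akka-streams/ScalaTest terms rather than the Play helper: if WsClient.send materializes Sink.head instead of Sink.foreach(fn), the first reply becomes a Future[String] you can assert on with ScalaFutures. The spec name, the fake reply source, and the older org.scalatest.FeatureSpec style (AnyFeatureSpec in newer ScalaTest) are assumptions for illustration only.

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import akka.stream.scaladsl.{Keep, Sink, Source}
import org.scalatest.FeatureSpec
import org.scalatest.concurrent.ScalaFutures

class WsReplySpec extends FeatureSpec with ScalaFutures {
  implicit val system: ActorSystem = ActorSystem("ws-test")
  implicit val materializer: ActorMaterializer = ActorMaterializer()

  scenario("Message is not intended for the server") {
    // Stand-in for the websocket round trip: in a real test this Source would be
    // Source.single(convertJson(msg)).via(Http().webSocketClientFlow(...)) as in WsClient.
    val replies = Source.single("""{"status":"REJECT"}""")

    // Keep.right + Sink.head materializes a Future[String] holding the first reply.
    val firstReply = replies.toMat(Sink.head[String])(Keep.right).run()

    whenReady(firstReply) { msg =>
      assert(msg.contains("REJECT"))
    }
  }
}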

Connect to Amazon account using Scala

I want to connect to my Amazon account in order to delete resources inside my S3 storage.
I have the access key and secret key, and this is how I started to build my connection to Amazon:
def connectToAmazon(): Unit = {
  val AWS_ACCESS_KEY = conf.getString("WebRecorder.PushSession.AccessKey")
  val AWS_SECRET_KEY = conf.getString("WebRecorder.PushSession.SecretKey")
  val AWSCredentials = new BasicAWSCredentials(AWS_ACCESS_KEY, AWS_SECRET_KEY)
}
Can you elaborate on how I may do this?
I used this solution to get the bucket name and the number of objects:
import scala.collection.JavaConversions._
import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}
import com.amazonaws.services.s3
import com.amazonaws.services.s3.model.{GetObjectTaggingRequest, ObjectListing, S3ObjectSummary}
import com.amazonaws.services.s3.{AmazonS3Client, AmazonS3ClientBuilder}
import com.clicktale.pipeline.framework.dal.ConfigParser.conf
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.auth.BasicAWSCredentials
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.services.s3.model._
import scala.language.postfixOps
class Amazon {
val AWS_ACCESS_KEY = conf.getString("WebRecorder.PushSession.AccessKey")
val AWS_SECRET_KEY = conf.getString("WebRecorder.PushSession.SecretKey")
val bucketName = "nv-q-s3-assets-01"
val provider = new AWSStaticCredentialsProvider(new BasicAWSCredentials(AWS_ACCESS_KEY, AWS_SECRET_KEY))
val client = AmazonS3ClientBuilder.standard().withCredentials(provider).withRegion("us-east-1").build()
// def connectToAmazon(): Unit = {
//
// val provider = new AWSStaticCredentialsProvider(new BasicAWSCredentials(AWS_ACCESS_KEY, AWS_SECRET_KEY))
// val client = AmazonS3ClientBuilder.standard().withCredentials(provider).withRegion("us-east-1").build()
def removeObjectsFromBucket(){
println("Removing objects from bucket...")
var object_listing: ObjectListing = client.listObjects(bucketName)
var flag: Boolean = true
while (flag) {
val iterator: Iterator[_] = object_listing.getObjectSummaries.iterator()
while (iterator.hasNext) {
val summary: S3ObjectSummary = iterator.next().asInstanceOf[S3ObjectSummary]
client.deleteObject(bucketName, summary.getKey())
}
flag=false
}
}
def countNumberOfObjectsInsideBucket(): Unit ={
var object_listing: ObjectListing = client.listObjects(bucketName)
var flag: Boolean = true
var count=0
while (flag) {
val iterator: Iterator[_] = object_listing.getObjectSummaries.iterator()
while (iterator.hasNext) {
val summary: S3ObjectSummary = iterator.next().asInstanceOf[S3ObjectSummary]
count+=1
}
flag=false
println("Number of objects are: " + count)
}
}
}
You need an AWSCredentialsProvider:
val provider = new AWSStaticCredentialsProvider(
new BasicAWSCredentials(AWS_ACCESS_KEY,AWS_SECRET_KEY)
)
and then use it to create the client:
val client = AmazonS3ClientBuilder
.standard
.withCredentials(provider)
.withRegion("us-west-1") // or whatever your region is
.build
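Note that listObjects returns at most one page of results (around 1000 keys), so the listing loops above only see the first page. A sketch, reusing the client and the bucketName from the snippets above, that follows truncated listings before deleting:

import scala.collection.JavaConverters._
import com.amazonaws.services.s3.model.ObjectListing

def deleteAllObjects(): Unit = {
  var listing: ObjectListing = client.listObjects(bucketName)
  var more = true
  while (more) {
    listing.getObjectSummaries.asScala.foreach { summary =>
      client.deleteObject(bucketName, summary.getKey)
    }
    // follow the pagination until the listing is no longer truncated
    if (listing.isTruncated) listing = client.listNextBatchOfObjects(listing)
    else more = false
  }
}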

Spark Scala UDP receive on listening port

The example mentioned in
http://spark.apache.org/docs/latest/streaming-programming-guide.html
lets me receive data packets in a TCP stream, listening on port 9999:
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3
// Create a local StreamingContext with two working thread and batch interval of 1 second.
// The master requires 2 cores to prevent from a starvation scenario.
val conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
val ssc = new StreamingContext(conf, Seconds(1))
// Create a DStream that will connect to hostname:port, like localhost:9999
val lines = ssc.socketTextStream("localhost", 9999)
// Split each line into words
val words = lines.flatMap(_.split(" "))
import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3
// Count each word in each batch
val pairs = words.map(word => (word, 1))
val wordCounts = pairs.reduceByKey(_ + _)
// Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.print()
ssc.start() // Start the computation
ssc.awaitTermination() // Wait for the computation to terminate
I am able to send data over TCP by creating a data server on my Linux system with
$ nc -lk 9999
Question
I need to receive a stream from an Android phone that streams over UDP, but the Scala/Spark
val lines = ssc.socketTextStream("localhost", 9999)
receives ONLY TCP streams.
How can I receive UDP streams in a similarly simple manner using Scala + Spark and create a Spark DStream?
There isn't anything built in, but it's not too much work to get it done yourself. Here is a simple solution I made based on a custom UdpSocketInputDStream[T]:
import java.io._
import java.net.{ConnectException, DatagramPacket, DatagramSocket, InetAddress}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import scala.reflect.ClassTag
import scala.util.control.NonFatal
class UdpSocketInputDStream[T: ClassTag](
_ssc: StreamingContext,
host: String,
port: Int,
bytesToObjects: InputStream => Iterator[T],
storageLevel: StorageLevel
) extends ReceiverInputDStream[T](_ssc) {
def getReceiver(): Receiver[T] = {
new UdpSocketReceiver(host, port, bytesToObjects, storageLevel)
}
}
class UdpSocketReceiver[T: ClassTag](host: String,
port: Int,
bytesToObjects: InputStream => Iterator[T],
storageLevel: StorageLevel) extends Receiver[T](storageLevel) {
var udpSocket: DatagramSocket = _
override def onStart(): Unit = {
try {
udpSocket = new DatagramSocket(port, InetAddress.getByName(host))
} catch {
case e: ConnectException =>
restart(s"Error connecting to $port", e)
return
}
// Start the thread that receives data over a connection
new Thread("Udp Socket Receiver") {
setDaemon(true)
override def run() {
receive()
}
}.start()
}
/** Create a socket connection and receive data until receiver is stopped */
def receive() {
try {
val buffer = new Array[Byte](2048)
// Create a packet to receive data into the buffer
val packet = new DatagramPacket(buffer, buffer.length)
udpSocket.receive(packet)
val iterator = bytesToObjects(new ByteArrayInputStream(packet.getData, packet.getOffset, packet.getLength))
// Now loop forever, waiting to receive packets and printing them.
while (!isStopped() && iterator.hasNext) {
store(iterator.next())
}
if (!isStopped()) {
restart("Udp socket data stream had no more data")
}
} catch {
case NonFatal(e) =>
restart("Error receiving data", e)
} finally {
onStop()
}
}
override def onStop(): Unit = {
synchronized {
if (udpSocket != null) {
udpSocket.close()
udpSocket = null
}
}
}
}
In order to get StreamingContext to add a method on itself, we enrich it with an implicit class:
object Implicits {
implicit class StreamingContextOps(val ssc: StreamingContext) extends AnyVal {
def udpSocketStream[T: ClassTag](host: String,
port: Int,
converter: InputStream => Iterator[T],
storageLevel: StorageLevel): InputDStream[T] = {
new UdpSocketInputDStream(ssc, host, port, converter, storageLevel)
}
}
}
And here is how you call it all:
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import org.apache.spark.SparkContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.reflect.ClassTag
object TestRunner {
import Implicits._
def main(args: Array[String]): Unit = {
val sparkContext = new SparkContext("local[*]", "udpTest")
val ssc = new StreamingContext(sparkContext, Seconds(4))
val stream = ssc.udpSocketStream("localhost",
3003,
bytesToLines,
StorageLevel.MEMORY_AND_DISK_SER_2)
stream.print()
ssc.start()
ssc.awaitTermination()
}
def bytesToLines(inputStream: InputStream): Iterator[String] = {
val dataInputStream = new BufferedReader(
new InputStreamReader(inputStream, StandardCharsets.UTF_8))
new NextIterator[String] {
protected override def getNext(): String = {
val nextValue = dataInputStream.readLine()
if (nextValue == null) {
finished = true
}
nextValue
}
protected override def close() {
dataInputStream.close()
}
}
}
abstract class NextIterator[U] extends Iterator[U] {
protected var finished = false
private var gotNext = false
private var nextValue: U = _
private var closed = false
override def next(): U = {
if (!hasNext) {
throw new NoSuchElementException("End of stream")
}
gotNext = false
nextValue
}
override def hasNext: Boolean = {
if (!finished) {
if (!gotNext) {
nextValue = getNext()
if (finished) {
closeIfNeeded()
}
gotNext = true
}
}
!finished
}
def closeIfNeeded() {
if (!closed) {
closed = true
close()
}
}
protected def getNext(): U
protected def close()
}
}
Most of this code is taken from the SocketInputDStream[T] provided by Spark; I simply re-used it. I also took the code for the NextIterator which is used by bytesToLines; all it does is consume a line from the packet and transform it into a String. If you have more complex logic, you can provide your own implementation of converter: InputStream => Iterator[T].
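For example, a hypothetical converter that turns "deviceId,value" lines into typed records instead of raw strings (the Reading case class and the line format are assumptions, not part of the original answer):

import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.nio.charset.StandardCharsets

case class Reading(deviceId: String, value: Double)

def bytesToReadings(inputStream: InputStream): Iterator[Reading] = {
  val reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))
  Iterator
    .continually(reader.readLine()) // read until the packet's data is exhausted
    .takeWhile(_ != null)
    .map { line =>
      val parts = line.split(",", 2) // assumes well-formed "id,value" lines
      Reading(parts(0), parts(1).toDouble)
    }
}

// plugged in exactly like bytesToLines:
// val stream = ssc.udpSocketStream("localhost", 3003, bytesToReadings, StorageLevel.MEMORY_AND_DISK_SER_2)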
Testing it with a simple UDP packet:
echo -n "hello hello hello!" >/dev/udp/localhost/3003
Yields:
-------------------------------------------
Time: 1482676728000 ms
-------------------------------------------
hello hello hello!
Of course, this has to be tested further. There is also a hidden assumption that each buffer created from the DatagramPacket is 2048 bytes, which is perhaps something you'll want to change.
The problem with Yuval Itzchakov's solution is that the receiver receives one message and then restarts itself. Just replace the restart call with a recursive call to receive, as shown below.
def receive() {
try {
val buffer = new Array[Byte](200000)
// Create a packet to receive data into the buffer
val packet = new DatagramPacket(buffer, buffer.length)
udpSocket.receive(packet)
val iterator = bytesToLines(new ByteArrayInputStream(packet.getData, packet.getOffset, packet.getLength))
// Now loop forever, waiting to receive packets and printing them.
while (!isStopped() && iterator.hasNext) {
store(iterator)
}
if (!isStopped()) {
// restart("Udp socket data stream had no more data")
receive()
}
} catch {
case NonFatal(e) =>
restart("Error receiving data", e)
} finally {
onStop()
}
}

Akka-http process requests with Stream

I am trying to write a simple akka-http and akka-streams based application that handles HTTP requests, always with one precompiled stream, because I plan to use long-running processing with back-pressure in my requestProcessor stream.
My application code:
import akka.actor.{ActorSystem, Props}
import akka.http.scaladsl._
import akka.http.scaladsl.server.Directives._
import akka.http.scaladsl.server._
import akka.stream.ActorFlowMaterializer
import akka.stream.actor.ActorPublisher
import akka.stream.scaladsl.{Sink, Source}
import scala.annotation.tailrec
import scala.concurrent.Future
object UserRegisterSource {
def props: Props = Props[UserRegisterSource]
final case class RegisterUser(username: String)
}
class UserRegisterSource extends ActorPublisher[UserRegisterSource.RegisterUser] {
import UserRegisterSource._
import akka.stream.actor.ActorPublisherMessage._
val MaxBufferSize = 100
var buf = Vector.empty[RegisterUser]
override def receive: Receive = {
case request: RegisterUser =>
if (buf.isEmpty && totalDemand > 0)
onNext(request)
else {
buf :+= request
deliverBuf()
}
case Request(_) =>
deliverBuf()
case Cancel =>
context.stop(self)
}
@tailrec final def deliverBuf(): Unit =
if (totalDemand > 0) {
if (totalDemand <= Int.MaxValue) {
val (use, keep) = buf.splitAt(totalDemand.toInt)
buf = keep
use foreach onNext
} else {
val (use, keep) = buf.splitAt(Int.MaxValue)
buf = keep
use foreach onNext
deliverBuf()
}
}
}
object Main extends App {
val host = "127.0.0.1"
val port = 8094
implicit val system = ActorSystem("my-testing-system")
implicit val fm = ActorFlowMaterializer()
implicit val executionContext = system.dispatcher
val serverSource: Source[Http.IncomingConnection, Future[Http.ServerBinding]] = Http(system).bind(interface = host, port = port)
val mySource = Source.actorPublisher[UserRegisterSource.RegisterUser](UserRegisterSource.props)
val requestProcessor = mySource
.mapAsync(1)(fakeSaveUserAndReturnCreatedUserId)
.to(Sink.head[Int])
.run()
val route: Route =
get {
path("test") {
parameter('test) { case t: String =>
requestProcessor ! UserRegisterSource.RegisterUser(t)
???
}
}
}
def fakeSaveUserAndReturnCreatedUserId(param: UserRegisterSource.RegisterUser): Future[Int] =
Future.successful {
1
}
serverSource.to(Sink.foreach {
connection =>
connection handleWith Route.handlerFlow(route)
}).run()
}
I found a solution for creating a Source that can dynamically accept new items to process, but I cannot find any solution for how to obtain the result of the stream execution in my route.
The direct answer to your question is to materialize a new Stream for each HttpRequest and use Sink.head to get the value you're looking for. Modifying your code:
val requestStream =
mySource.map(fakeSaveUserAndReturnCreatedUserId)
.to(Sink.head[Int])
//.run() - don't materialize here
val route: Route =
get {
path("test") {
parameter('test) { case t: String =>
//materialize a new Stream here
val userIdFut : Future[Int] = requestStream.run()
requestProcessor ! UserRegisterSource.RegisterUser(t)
//get the result of the Stream
userIdFut onSuccess { case userId : Int => ...}
}
}
}
However, I think your question is ill-posed. In your code example, the only thing you're using an akka Stream for is to create a new UserId. Futures readily solve this problem without the need for a materialized Stream (and all the accompanying overhead):
val route: Route =
get {
path("test") {
parameter('test) { case t: String =>
val user = RegisterUser(t)
fakeSaveUserAndReturnCreatedUserId(user) onSuccess { case userId : Int =>
...
}
}
}
}
If you want to limit the number of concurrent calls to fakeSaveUserAndReturnCreatedUserId, then you can create an ExecutionContext with a defined ThreadPool size, as explained in the answer to this question, and use that ExecutionContext to create the Futures:
import java.util.concurrent.Executors
import scala.concurrent.ExecutionContext

val ThreadCount = 10 // concurrent queries
val limitedExecutionContext =
  ExecutionContext.fromExecutor(Executors.newFixedThreadPool(ThreadCount))

def fakeSaveUserAndReturnCreatedUserId(param: UserRegisterSource.RegisterUser): Future[Int] =
  Future { 1 }(limitedExecutionContext)