How to send multiple files to kafka producer using akka stream in Scala - scala

I am trying to send multiple data to kafka producer using akka stream , meanwhile I wrote the producer itself , but struggling of how to use akka-streamIO in order to get multiple files which will be the data I want to send to my kafka Producer this is my code:
object App {
def main(args: Array[String]): Unit = {
val file = Paths.get("233339.8.1231731728115136.1722327129833578.log")
// val file = Paths.get("example.csv")
//
// val foreach: Future[IOResult] = FileIO.fromPath(file)
// .to(Sink.ignore)
// .run()
println("Hello from producer")
implicit val system:ActorSystem = ActorSystem("producer-example")
implicit val materializer:Materializer = ActorMaterializer()
val producerSettings = ProducerSettings(system,new StringSerializer,new StringSerializer)
val done: Future[Done] =
Source(1 to 955)
.map(value => new ProducerRecord[String, String]("test-topic", s"$file : $value"))
.runWith(Producer.plainSink(producerSettings))
implicit val ec: ExecutionContextExecutor = system.dispatcher
done onComplete {
case Success(_) => println("Done"); system.terminate()
case Failure(err) => println(err.toString); system.terminate()
}
}
}

Given multiple file names:
val fileNames : Iterable[String] = ???
It is possible to create a Source that emits the contents of the files concatenated together using flatMapConcat:
val chunkSize = 8192
val chunkSource : Source[ByteString, _] =
Source.apply(fileNames)
.map(fileName => Paths get fileName)
.flatMapConcat(path => FileIO.fromPath(path, chunkSize))
This will emit fixed size ByteString values that are all chunkSize length, except for possibly the last value which may be smaller.
If you want to breakup the lines by some delimiter then you can use Framing:
val delimiter : ByteString = ???
val maxFrameLength : Int = ???
val framingSource : Source[ByteString, _] =
chunkSource.via(Framing.delimiter(delimiter, maxFrameLength))

Related

Akka - convert Flow into Collection or Publisher

I'm trying to split an Akka Source into two separate ones.
val requestFlow = Flow[BodyPartEntity].to(Sink.seq) // convert to Seq[BodyPartEntity]
val dataFlow = Flow[BodyPartEntity].to(Sink.asPublisher(fanout = false)) // convert to Publisher[BodyPartEntity]
implicit class EitherSourceExtension[L, R, Mat](source: Source[FormData.BodyPart, Mat]) {
def partition(left: Sink[BodyPartEntity, NotUsed], right: Sink[BodyPartEntity, NotUsed]): Graph[ClosedShape, NotUsed] = {
GraphDSL.create() { implicit builder =>
import akka.stream.scaladsl.GraphDSL.Implicits._
val partition = builder.add(Partition[FormData.BodyPart](2, element => if (element.getName == "request") 0 else 1))
source ~> partition.in
partition.out(0).map(_.getEntity) ~> left
partition.out(1).map(_.getEntity) ~> right
ClosedShape
}
}
}
How to convert requestFlow into Seq[BodyPartEntity] and dataFlow into Publisher[BodyPartEntity]
You could use a BroadcastHub for this. From doc:
A BroadcastHub can be used to consume elements from a common producer by a dynamic set of consumers.
Simplified code:
val runnableGraph: RunnableGraph[Source[Int, NotUsed]] =
Source(1 to 5).toMat(
BroadcastHub.sink(bufferSize = 4))(Keep.right)
val fromProducer: Source[Int, NotUsed] = runnableGraph.run()
// Process the messages from the producer in two independent consumers
fromProducer.runForeach(msg => println("consumer1: " + msg))
fromProducer.runForeach(msg => println("consumer2: " + msg))

How to using Akka Stream with Akk-Http to stream the response

I'm new to Akka Stream. I used following code for CSV parsing.
class CsvParser(config: Config)(implicit system: ActorSystem) extends LazyLogging with NumberValidation {
import system.dispatcher
private val importDirectory = Paths.get(config.getString("importer.import-directory")).toFile
private val linesToSkip = config.getInt("importer.lines-to-skip")
private val concurrentFiles = config.getInt("importer.concurrent-files")
private val concurrentWrites = config.getInt("importer.concurrent-writes")
private val nonIOParallelism = config.getInt("importer.non-io-parallelism")
def save(r: ValidReading): Future[Unit] = {
Future()
}
def parseLine(filePath: String)(line: String): Future[Reading] = Future {
val fields = line.split(";")
val id = fields(0).toInt
try {
val value = fields(1).toDouble
ValidReading(id, value)
} catch {
case t: Throwable =>
logger.error(s"Unable to parse line in $filePath:\n$line: ${t.getMessage}")
InvalidReading(id)
}
}
val lineDelimiter: Flow[ByteString, ByteString, NotUsed] =
Framing.delimiter(ByteString("\n"), 128, allowTruncation = true)
val parseFile: Flow[File, Reading, NotUsed] =
Flow[File].flatMapConcat { file =>
val src = FileSource.fromFile(file).getLines()
val source : Source[String, NotUsed] = Source.fromIterator(() => src)
// val gzipInputStream = new GZIPInputStream(new FileInputStream(file))
source
.mapAsync(parallelism = nonIOParallelism)(parseLine(file.getPath))
}
val computeAverage: Flow[Reading, ValidReading, NotUsed] =
Flow[Reading].grouped(2).mapAsyncUnordered(parallelism = nonIOParallelism) { readings =>
Future {
val validReadings = readings.collect { case r: ValidReading => r }
val average = if (validReadings.nonEmpty) validReadings.map(_.value).sum / validReadings.size else -1
ValidReading(readings.head.id, average)
}
}
val storeReadings: Sink[ValidReading, Future[Done]] =
Flow[ValidReading]
.mapAsyncUnordered(concurrentWrites)(save)
.toMat(Sink.ignore)(Keep.right)
val processSingleFile: Flow[File, ValidReading, NotUsed] =
Flow[File]
.via(parseFile)
.via(computeAverage)
def importFromFiles = {
implicit val materializer = ActorMaterializer()
val files = importDirectory.listFiles.toList
logger.info(s"Starting import of ${files.size} files from ${importDirectory.getPath}")
val startTime = System.currentTimeMillis()
val balancer = GraphDSL.create() { implicit builder =>
import GraphDSL.Implicits._
val balance = builder.add(Balance[File](concurrentFiles))
val merge = builder.add(Merge[ValidReading](concurrentFiles))
(1 to concurrentFiles).foreach { _ =>
balance ~> processSingleFile ~> merge
}
FlowShape(balance.in, merge.out)
}
Source(files)
.via(balancer)
.withAttributes(ActorAttributes.supervisionStrategy { e =>
logger.error("Exception thrown during stream processing", e)
Supervision.Resume
})
.runWith(storeReadings)
.andThen {
case Success(_) =>
val elapsedTime = (System.currentTimeMillis() - startTime) / 1000.0
logger.info(s"Import finished in ${elapsedTime}s")
case Failure(e) => logger.error("Import failed", e)
}
}
}
I wanted to to use Akka HTTP which would give all ValidReading entities parsed from CSV but I couldn't understand on how would I do that.
The above code fetches file from server and parse each lines to generate ValidReading.
How can I pass/upload CSV via akka-http, parse the file and stream the resulted response back to the endpoint?
The "essence" of the solution is something like this:
import akka.http.scaladsl.server.Directives._
val route = fileUpload("csv") {
case (metadata, byteSource) =>
val source = byteSource.map(x => x)
complete(HttpResponse(entity = HttpEntity(ContentTypes.`text/csv(UTF-8)`, source)))
}
You detect that the uploaded thing is a multipart-form-data with a chunk named "csv". You get the byteSource from that. Do the calculation (insert your logic to the .map(x=>x) part). Convert your data back to ByteString. Complete the request with the new source. This will make your endoint like a proxy.

Akka Streams: Concatenating streams that initialise resources

I have two streams: A and B.
A is writing a file to disk and appends its elements during its execution.
B is reading that file and does some processing with the data.
Only AFTER A is done with processing and writing its data to the file, I want to start with B.
I tried to concat the two streams with:
A.concat(
Source.lazily { () =>
println("B is getting initialised")
getStreamForB()
}
)
But this is already initialising B BEFORE A has finished.
There is a ticket tracking the fact that Source#concat does not support lazy materialization. That ticket mentions the following work-around:
implicit class SourceLazyOps[E, M](val src: Source[E, M]) {
def concatLazy[M1](src2: => Source[E, M1]): Source[E, NotUsed] =
Source(List(() => src, () => src2)).flatMapConcat(_())
}
Applying the above implicit class to your case:
A.concatLazy(
Source.lazily { () =>
println("B is getting initialised")
getStreamForB()
}
)
The FileIO.toPath method will materialize the stream into a Future[IOResult]. If you are working with stream A that is writing to a file:
val someDataSource : Source[ByteString, _] = ???
val filePath : Path = ???
val fileWriteOptions : Set[OpenOption] = ???
val A : Future[IOResult] =
someDataSource
.to(FileIO.toPath(filePath, fileWriteOptions))
.run()
You can use the materialized Future to kick off your stream B once the writing is completed:
val fileReadOptions : Set[OpenOption] = ???
val someProcessingWithTheDataOfB : Sink[ByteString, _] = ???
A foreach { _ =>
val B : Future[IOResult] =
FileIO
.fromPath(filePath, fileReadOptions)
.to(someProcessingWithTheDataOfB)
.run()
}
Similarly, you could do some testing of the IOResult before doing the reading to make sure there were no failures during the writing process:
A.filter(ioResult => ioResult.status.isSuccess)
.foreach { _ =>
val B : Future[IOResult] =
FileIO
.fromPath(filePath, readOptions)
.to(someProcessingOfB)
.run()
}

File Upload and processing using akka-http websockets

I'm using some sample Scala code to make a server that receives a file over websocket, stores the file temporarily, runs a bash script on it, and then returns stdout by TextMessage.
Sample code was taken from this github project.
I edited the code slightly within echoService so that it runs another function that processes the temporary file.
object WebServer {
def main(args: Array[String]) {
implicit val actorSystem = ActorSystem("akka-system")
implicit val flowMaterializer = ActorMaterializer()
val interface = "localhost"
val port = 3000
import Directives._
val route = get {
pathEndOrSingleSlash {
complete("Welcome to websocket server")
}
} ~
path("upload") {
handleWebSocketMessages(echoService)
}
val binding = Http().bindAndHandle(route, interface, port)
println(s"Server is now online at http://$interface:$port\nPress RETURN to stop...")
StdIn.readLine()
binding.flatMap(_.unbind()).onComplete(_ => actorSystem.shutdown())
println("Server is down...")
}
implicit val actorSystem = ActorSystem("akka-system")
implicit val flowMaterializer = ActorMaterializer()
val echoService: Flow[Message, Message, _] = Flow[Message].mapConcat {
case BinaryMessage.Strict(msg) => {
val decoded: Array[Byte] = msg.toArray
val imgOutFile = new File("/tmp/" + "filename")
val fileOuputStream = new FileOutputStream(imgOutFile)
fileOuputStream.write(decoded)
fileOuputStream.close()
TextMessage(analyze(imgOutFile))
}
case BinaryMessage.Streamed(stream) => {
stream
.limit(Int.MaxValue) // Max frames we are willing to wait for
.completionTimeout(50 seconds) // Max time until last frame
.runFold(ByteString(""))(_ ++ _) // Merges the frames
.flatMap { (msg: ByteString) =>
val decoded: Array[Byte] = msg.toArray
val imgOutFile = new File("/tmp/" + "filename")
val fileOuputStream = new FileOutputStream(imgOutFile)
fileOuputStream.write(decoded)
fileOuputStream.close()
Future(Source.single(""))
}
TextMessage(analyze(imgOutFile))
}
private def analyze(imgfile: File): String = {
val p = Runtime.getRuntime.exec(Array("./run-vision.sh", imgfile.toString))
val br = new BufferedReader(new InputStreamReader(p.getInputStream, StandardCharsets.UTF_8))
try {
val result = Stream
.continually(br.readLine())
.takeWhile(_ ne null)
.mkString
result
} finally {
br.close()
}
}
}
}
During testing using Dark WebSocket Terminal, case BinaryMessage.Strict works fine.
Problem: However, case BinaryMessage.Streaming doesn't finish writing the file before running the analyze function, resulting in a blank response from the server.
I'm trying to wrap my head around how Futures are being used here with the Flows in Akka-HTTP, but I'm not having much luck outside trying to get through all the official documentation.
Currently, .mapAsync seems promising, or basically finding a way to chain futures.
I'd really appreciate some insight.
Yes, mapAsync will help you in this occasion. It is a combinator to execute Futures (potentially in parallel) in your stream, and present their results on the output side.
In your case to make things homogenous and make the type checker happy, you'll need to wrap the result of the Strict case into a Future.successful.
A quick fix for your code could be:
val echoService: Flow[Message, Message, _] = Flow[Message].mapAsync(parallelism = 5) {
case BinaryMessage.Strict(msg) => {
val decoded: Array[Byte] = msg.toArray
val imgOutFile = new File("/tmp/" + "filename")
val fileOuputStream = new FileOutputStream(imgOutFile)
fileOuputStream.write(decoded)
fileOuputStream.close()
Future.successful(TextMessage(analyze(imgOutFile)))
}
case BinaryMessage.Streamed(stream) =>
stream
.limit(Int.MaxValue) // Max frames we are willing to wait for
.completionTimeout(50 seconds) // Max time until last frame
.runFold(ByteString(""))(_ ++ _) // Merges the frames
.flatMap { (msg: ByteString) =>
val decoded: Array[Byte] = msg.toArray
val imgOutFile = new File("/tmp/" + "filename")
val fileOuputStream = new FileOutputStream(imgOutFile)
fileOuputStream.write(decoded)
fileOuputStream.close()
Future.successful(TextMessage(analyze(imgOutFile)))
}
}

Spark Streaming - Broadcast variable - Case Class

My requirement is to enrich data stream data with profile information from a HBase table. I was looking to use a broadcast variable. Enclosed the whole code here.
The output of HBase data is as follows
In the Driver node HBaseReaderBuilder
(org.apache.spark.SparkContext#3c58b102,hbase_customer_profile,Some(data),WrappedArray(gender, age),None,None,List()))
In the Worker node
HBaseReaderBuilder(null,hbase_customer_profile,Some(data),WrappedArray(gender, age),None,None,List()))
As you can see it has lost the spark context. When i issue the statement val
myRdd = bcdocRdd.map(r => Profile(r._1, r._2, r._3)) i get a NullPointerException
java.lang.NullPointerException
at it.nerdammer.spark.hbase.HBaseReaderBuilderConversions$class.toSimpleHBaseRDD(HBaseReaderBuilder.scala:83)
at it.nerdammer.spark.hbase.package$.toSimpleHBaseRDD(package.scala:5)
at it.nerdammer.spark.hbase.HBaseReaderBuilderConversions$class.toHBaseRDD(HBaseReaderBuilder.scala:67)
at it.nerdammer.spark.hbase.package$.toHBaseRDD(package.scala:5)
at testPartition$$anonfun$main$1$$anonfun$apply$1$$anonfun$apply$2.apply(testPartition.scala:34)
at testPartition$$anonfun$main$1$$anonfun$apply$1$$anonfun$apply$2.apply(testPartition.scala:33)
object testPartition {
def main(args: Array[String]) : Unit = {
val sparkMaster = "spark://x.x.x.x:7077"
val ipaddress = "x.x.x.x:2181" // Zookeeper
val hadoopHome = "/home/hadoop/software/hadoop-2.6.0"
val topicname = "new_events_test_topic"
val mainConf = new SparkConf().setMaster(sparkMaster).setAppName("testingPartition")
val mainSparkContext = new SparkContext(mainConf)
val ssc = new StreamingContext(mainSparkContext, Seconds(30))
val eventsStream = KafkaUtils.createStream(ssc,"x.x.x.x:2181","receive_rest_events",Map(topicname.toString -> 2))
val docRdd = mainSparkContext.hbaseTable[(String, Option[String], Option[String])]("hbase_customer_profile").select("gender","age").inColumnFamily("data")
println ("docRDD from Driver ",docRdd)
val broadcastedprof = mainSparkContext.broadcast(docRdd)
eventsStream.foreachRDD(dstream => {
dstream.foreachPartition(records => {
println("Broadcasted docRDD - in Worker ", broadcastedprof.value)
val bcdocRdd = broadcastedprof.value
records.foreach(record => {
//val myRdd = bcdocRdd.map(r => Profile(r._1, r._2, r._3))
//myRdd.foreach(println)
val Rows = record._2.split("\r\n")
})
})
})
ssc.start()
ssc.awaitTermination()
}
}