How to read from embedded-kafka with fs2-kafka - scala

I am using fs2-kafka to read from embedded-kafka.
I create the embedded Kafka broker using withRunningKafkaOnFoundPort, create a topic and publish a few messages. However, when I try to read them back with fs2-kafka I get a NullPointerException. I have isolated a test case.
Here is my code:
import cats.effect._
import cats.implicits._
import cats.effect.implicits._
import fs2.Stream
import fs2.kafka.{AutoOffsetReset, ConsumerSettings, KafkaConsumer, consumerStream}
import net.manub.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig}
import org.scalatest.{BeforeAndAfterAll, FunSuite}
import scala.concurrent.ExecutionContext
class KafkaSuite extends FunSuite with EmbeddedKafka {
  val singleThreadExecutor = ExecutionContext.fromExecutor((task: Runnable) => task.run())
  implicit val contextShift = IO.contextShift(singleThreadExecutor)
  implicit val timer = IO.timer(singleThreadExecutor)

  val topic = "example"
  val partition = 0
  val clientId = "client"

  test("works") {
    val userDefinedConfig = EmbeddedKafkaConfig(kafkaPort = 0, zooKeeperPort = 0)
    withRunningKafkaOnFoundPort(userDefinedConfig) { implicit actualConfig =>
      createCustomTopic(topic)
      publishStringMessageToKafka(topic, "example-message1")
      publishStringMessageToKafka(topic, "example-message2")
      publishStringMessageToKafka(topic, "example-message3")
      publishStringMessageToKafka(topic, "example-message4")

      val broker = s"localhost:${actualConfig.kafkaPort}"
      val consumerSettings = ConsumerSettings[IO, String, String]
        .withAutoOffsetReset(AutoOffsetReset.Earliest)
        .withBootstrapServers(broker)
        .withGroupId("group")
        .withClientId(clientId)

      val r = consumerStream[IO].using(consumerSettings)
        .evalTap(_.subscribeTo(topic))
        .evalTap(_.seekToBeginning)
        .flatMap { consumer =>
          consumer.stream.take(1)
        }
        .compile
        .toList

      val res = r.unsafeRunSync()
      Console.println(res)
      assert(res.size == 1)
    }
  }
}
build.sbt:
name := "test"
version := "0.1"
scalaVersion := "2.12.6"
libraryDependencies ++= Seq(
"org.scalatest" % "scalatest_2.12" % "3.1.2" % "test",
"org.slf4j" % "slf4j-simple" % "1.7.25",
"com.github.fd4s" %% "fs2-kafka" % "1.0.0",
"io.github.embeddedkafka" %% "embedded-kafka" % "2.4.1.1" % Test
)
And here is the stacktrace:
java.lang.NullPointerException was thrown.
java.lang.NullPointerException
at java.lang.String.<init>(String.java:515)
at fs2.kafka.Deserializer$.$anonfun$string$1(Deserializer.scala:208)
at fs2.kafka.Deserializer$.$anonfun$lift$1(Deserializer.scala:184)
at fs2.kafka.Deserializer$$anon$1.deserialize(Deserializer.scala:133)
at fs2.kafka.ConsumerRecord$.deserializeFromBytes(ConsumerRecord.scala:166)
at fs2.kafka.ConsumerRecord$.fromJava(ConsumerRecord.scala:177)
at fs2.kafka.internal.KafkaConsumerActor.$anonfun$records$2(KafkaConsumerActor.scala:378)
at cats.data.NonEmptyVectorInstances$$anon$1.traverse(NonEmptyVector.scala:300)
at cats.data.NonEmptyVectorInstances$$anon$1.traverse(NonEmptyVector.scala:245)
at cats.Traverse$Ops.traverse(Traverse.scala:19)
at cats.Traverse$Ops.traverse$(Traverse.scala:19)
at cats.Traverse$ToTraverseOps$$anon$2.traverse(Traverse.scala:19)
at fs2.kafka.internal.KafkaConsumerActor.$anonfun$records$1(KafkaConsumerActor.scala:376)
at cats.instances.VectorInstances$$anon$1.$anonfun$traverse$2(vector.scala:80)
at cats.instances.VectorInstances$$anon$1.loop$2(vector.scala:43)
at cats.instances.VectorInstances$$anon$1.$anonfun$foldRight$2(vector.scala:44)
at cats.Eval$.advance(Eval.scala:271)
at cats.Eval$.loop$1(Eval.scala:350)
at cats.Eval$.cats$Eval$$evaluate(Eval.scala:368)
at cats.Eval$Defer.value(Eval.scala:257)
at cats.instances.VectorInstances$$anon$1.traverse(vector.scala:79)
at cats.instances.VectorInstances$$anon$1.traverse(vector.scala:15)
at cats.Traverse$Ops.traverse(Traverse.scala:19)
at cats.Traverse$Ops.traverse$(Traverse.scala:19)
at cats.Traverse$ToTraverseOps$$anon$2.traverse(Traverse.scala:19)
at fs2.kafka.internal.KafkaConsumerActor.records(KafkaConsumerActor.scala:373)
at fs2.kafka.internal.KafkaConsumerActor.$anonfun$poll$2(KafkaConsumerActor.scala:405)
at cats.effect.internals.IORunLoop$.liftedTree1$1(IORunLoop.scala:95)
at cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:95)
at cats.effect.internals.IORunLoop$.startCancelable(IORunLoop.scala:41)
at cats.effect.internals.IOBracket$BracketStart.run(IOBracket.scala:86)
at cats.effect.internals.Trampoline.cats$effect$internals$Trampoline$$immediateLoop(Trampoline.scala:70)
at cats.effect.internals.Trampoline.startLoop(Trampoline.scala:36)
at cats.effect.internals.TrampolineEC$JVMTrampoline.super$startLoop(TrampolineEC.scala:93)
at cats.effect.internals.TrampolineEC$JVMTrampoline.$anonfun$startLoop$1(TrampolineEC.scala:93)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:81)
at cats.effect.internals.TrampolineEC$JVMTrampoline.startLoop(TrampolineEC.scala:93)
at cats.effect.internals.Trampoline.execute(Trampoline.scala:43)
at cats.effect.internals.TrampolineEC.execute(TrampolineEC.scala:44)
at cats.effect.internals.IOBracket$BracketStart.apply(IOBracket.scala:72)
at cats.effect.internals.IOBracket$BracketStart.apply(IOBracket.scala:52)
at cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:136)
at cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:355)
at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:376)
at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:316)
at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)

It turns out the problem is that the key type in ConsumerSettings[IO, String, String] is String, but embedded-kafka writes null as the key, so deserializing the key fails with a NullPointerException. Setting the key type to Unit gets rid of the exception.
Another problem is that withRunningKafkaOnFoundPort finishes before the evaluation of the IO even starts. To keep the broker running, make a Resource from embedded-kafka and run the IO inside it:
val embeddedKafka = Resource.make(IO(EmbeddedKafka.start()))((kafka) => IO(kafka.stop(true)))
The next problem is that fs2-kafka cannot work with a single-threaded executor, so you have to provide it with a thread pool (for example ExecutionContext.global).
Here is a full working example:
import cats.effect._
import fs2.Stream
import fs2.kafka.{AutoOffsetReset, ConsumerSettings, consumerStream}
import net.manub.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig}
import org.scalatest.FunSuite
import scala.concurrent.ExecutionContext
class KafkaSuite extends FunSuite with EmbeddedKafka {
  implicit val ec = ExecutionContext.global
  implicit val contextShift = IO.contextShift(ec)
  implicit val timer = IO.timer(ec)

  val topic = "example"
  val partition = 0
  val clientId = "client"
  val userDefinedConfig = EmbeddedKafkaConfig(kafkaPort = 0, zooKeeperPort = 0)

  def broker(port: Long) = s"localhost:${port}"

  val consumerSettings = ConsumerSettings[IO, Unit, String]
    .withAutoOffsetReset(AutoOffsetReset.Earliest)
    .withEnableAutoCommit(true)
    .withGroupId("group")
    .withClientId(clientId)

  val embeddedKafka = Resource.make(IO(EmbeddedKafka.start()))((kafka) => IO(kafka.stop(true)))

  test("works") {
    val r = Stream.resource(embeddedKafka).flatMap { kafka =>
      implicit val actualConfig: EmbeddedKafkaConfig = kafka.config
      createCustomTopic(topic)
      publishStringMessageToKafka(topic, "example-message1")
      publishStringMessageToKafka(topic, "example-message2")
      publishStringMessageToKafka(topic, "example-message3")
      publishStringMessageToKafka(topic, "example-message4")

      consumerStream(consumerSettings.withBootstrapServers(broker(actualConfig.kafkaPort)))
        .evalTap(_.subscribeTo(topic))
        .evalTap(_.seekToBeginning)
        .flatMap(_.stream)
        .map(_.record.value)
        .take(1)
    }
    val res = r.compile.toList.unsafeRunSync()
    assert(res.contains("example-message1"))
  }
}

Related

Why is there an error when I use org.json4s.jackson.Serialization.write from Spark

This is my first time using Scala and Spark. I use this code to extract data, and the extraction succeeds. If I don't use println(write(output1)) but use println(output1), the code runs well and prints the correct output, just without keys. But when I use println(write(output1)), I get an error. I use Scala 2.12.10 and Java 1.8. My code is as follows:
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.write
import org.apache.spark.SparkContext
import scala.collection.mutable.Map
//import java.io.Writer
object task1 {
  def main(args: Array[String]): Unit = {
    // Logger.getRootLogger.setLevel(Level.INFO)
    val sc = new SparkContext("local[*]", "task1")
    val reviewRDD = sc.textFile("review.json")

    val n_review = reviewRDD.count().toInt
    val n_review_2018 = reviewRDD.filter(x => x contains ",\"date\":\"2018-").count().toInt
    val n_user = reviewRDD.map(x => x.split(",")(1).split(":")(1)).distinct().count().toInt
    val top10_user = reviewRDD.map(x => (x.split(",")(1).split(":")(1).split("\"")(1), 1)).reduceByKey((x, y) => x + y).sortBy(r => (-r._2, r._1)).take(10)
    val n_business = reviewRDD.map(x => x.split(",")(2).split(":")(1)).distinct().count().toInt
    val top10_business = reviewRDD.map(x => (x.split(",")(2).split(":")(1).split("\"")(1), 1)).reduceByKey((x, y) => x + y).sortBy(r => (-r._2, r._1)).take(10)

    implicit val formats = DefaultFormats

    case class Output1(
      n_review: Int,
      n_review_2018: Int,
      n_user: Int,
      top10_user: List[(String, Int)],
      n_business: Int,
      top10_business: List[(String, Int)]
    )

    val output1 = Output1(
      n_review = n_review,
      n_review_2018 = n_review_2018,
      n_user = n_user,
      top10_user = top10_user.toList,
      n_business = n_business,
      top10_business = top10_business.toList
    )

    //val output11: String = write(output1)
    println(write(output1))
    //println(output11)
  }
}
The error is as follows:
Exception in thread "main" org.json4s.package$MappingException: Can't find ScalaSig for class task1$Output1$1
at org.json4s.reflect.package$.fail(package.scala:95)
at org.json4s.reflect.ScalaSigReader$.$anonfun$findClass$1(ScalaSigReader.scala:63)
at scala.Option.getOrElse(Option.scala:189)
at org.json4s.reflect.ScalaSigReader$.findClass(ScalaSigReader.scala:63)
at org.json4s.reflect.ScalaSigReader$.readConstructor(ScalaSigReader.scala:31)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.ctorParamType(Reflector.scala:119)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.$anonfun$ctorParamType$3(Reflector.scala:109)
at scala.collection.immutable.List.map(List.scala:297)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.ctorParamType(Reflector.scala:106)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.$anonfun$ctorParamType$3(Reflector.scala:109)
at scala.collection.immutable.List.map(List.scala:293)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.ctorParamType(Reflector.scala:106)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.$anonfun$createConstructorDescriptors$7(Reflector.scala:194)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at scala.collection.TraversableLike.map(TraversableLike.scala:286)
at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.$anonfun$createConstructorDescriptors$3(Reflector.scala:177)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
at scala.collection.mutable.ArraySeq.foreach(ArraySeq.scala:75)
at scala.collection.TraversableLike.map(TraversableLike.scala:286)
at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.createConstructorDescriptors(Reflector.scala:157)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.constructorsAndCompanion(Reflector.scala:136)
at org.json4s.reflect.Reflector$ClassDescriptorBuilder.result(Reflector.scala:220)
at org.json4s.reflect.Reflector$.createDescriptorWithFormats(Reflector.scala:61)
at org.json4s.reflect.Reflector$.$anonfun$describeWithFormats$1(Reflector.scala:52)
at org.json4s.reflect.package$Memo.apply(package.scala:36)
at org.json4s.reflect.Reflector$.describeWithFormats(Reflector.scala:52)
at org.json4s.Extraction$.decomposeObject$1(Extraction.scala:126)
at org.json4s.Extraction$.internalDecomposeWithBuilder(Extraction.scala:241)
at org.json4s.Extraction$.decomposeWithBuilder(Extraction.scala:70)
at org.json4s.Extraction$.decompose(Extraction.scala:255)
at org.json4s.jackson.Serialization$.write(Serialization.scala:22)
at task1$.main(task1.scala:51)
at task1.main(task1.scala)
Here is my build.sbt
name := "hw1"
ThisBuild / version := "0.1"
ThisBuild / scalaVersion := "2.12.15"
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.1.2"

Spark Scala - compile errors

I have a script in Scala that works well when I run it in Zeppelin, but when I try to compile it with sbt it doesn't work. I believe it is something related to the versions, but I am not able to identify it.
These three approaches return the same error:
val catMap = catDF.rdd.map((row: Row) => (row.getAs[String](1)->row.getAs[Integer](0))).collect.toMap
val catMap = catDF.select($"description", $"id".cast("int")).as[(String, Int)].collect.toMap
val catMap = catDF.rdd.map((row: Row) => (row.getAs[String](1)->row.getAs[Integer](0))).collectAsMap()
Returning an error: "value rdd is not a member of Unit"
val bizCat = bizCatRDD.rdd.map(t => (t.getAs[String](0),catMap(t.getAs[String](1)))).toDF
Returning an error: "value toDF is not a member of org.apache.spark.rdd.RDD[U]"
Scala version: 2.12
Sbt Version: 1.3.13
UPDATE:
The whole class is:
package importer
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import udf.functions._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.Column
object BusinessImporter extends Importer{
def importa(spark: SparkSession, inputDir: String): Unit = {
import spark.implicits._
val bizDF = spark.read.json(inputDir).cache
// categories
val explode_categories = bizDF.withColumn("categories", explode(split(col("categories"), ",")))
val sort_categories = explode_categories.select(col("categories").as("description"))
.distinct
.coalesce(1)
.orderBy(asc("categories"))
// Create sequence column
val windowSpec = Window.orderBy("description")
val categories_with_sequence = sort_categories.withColumn("id",row_number.over(windowSpec))
val categories = categories_with_sequence.select("id","description")
val catDF = categories.write.insertInto("categories")
// business categories
//val catMap = catDF.rdd.map((row: Row) => (row.getAs[String](1)->row.getAs[Integer](0))).collect.toMap
//val catMap = catDF.select($"description", $"id".cast("int")).as[(String, Int)].collect.toMap
val catMap = catDF.rdd.map((row: Row) => (row.getAs[String](1)->row.getAs[Integer](0))).collectAsMap()
val auxbizCatRDD = bizDF.withColumn("categories", explode(split(col("categories"), ",")))
val bizCatRDD = auxbizCatRDD.select("business_id","categories")
val bizCat = bizCatRDD.rdd.map(t => (t.getAs[String](0),catMap(t.getAs[String](1)))).toDF
bizCat.write.insertInto("business_category")
// Business
val businessDF = bizDF.select("business_id","categories","city","address","latitude","longitude","name","is_open","review_count","stars","state")
businessDF.coalesce(1).write.insertInto("business")
// Hours
val bizHoursDF = bizDF.select("business_id","hours.Sunday","hours.Monday","hours.Tuesday","hours.Wednesday","hours.Thursday","hours.Friday","hours.Saturday")
val bizHoursDF_structs = bizHoursDF
.withColumn("Sunday",struct(
split(col("Sunday"),"-").getItem(0).as("Open"),
split(col("Sunday"),"-").getItem(1).as("Close")))
.withColumn("Monday",struct(
split(col("Monday"),"-").getItem(0).as("Open"),
split(col("Monday"),"-").getItem(1).as("Close")))
.withColumn("Tuesday",struct(
split(col("Tuesday"),"-").getItem(0).as("Open"),
split(col("Tuesday"),"-").getItem(1).as("Close")))
.withColumn("Wednesday",struct(
split(col("Wednesday"),"-").getItem(0).as("Open"),
split(col("Wednesday"),"-").getItem(1).as("Close")))
.withColumn("Thursday",struct(
split(col("Thursday"),"-").getItem(0).as("Open"),
split(col("Thursday"),"-").getItem(1).as("Close")))
.withColumn("Friday",struct(
split(col("Friday"),"-").getItem(0).as("Open"),
split(col("Friday"),"-").getItem(1).as("Close")))
.withColumn("Saturday",struct(
split(col("Saturday"),"-").getItem(0).as("Open"),
split(col("Saturday"),"-").getItem(1).as("Close")))
bizHoursDF_structs.coalesce(1).write.insertInto("business_hour")
}
def singleSpace(col: Column): Column = {
trim(regexp_replace(col, " +", " "))
}
}
sbt file:
name := "yelp-spark-processor"
version := "1.0"
scalaVersion := "2.12.12"
libraryDependencies += "org.apache.spark" % "spark-core_2.12" % "3.0.1"
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "3.0.1"
libraryDependencies += "org.apache.spark" % "spark-hive_2.12" % "3.0.1"
Can someone please give me some guidance on what is wrong?
Many thanks,
Xavy
The issue here is that in Scala this line returns type Unit:
val catDF = categories.write.insertInto("categories")
Unit in Scala is like void in Java: it is returned by functions that don't return anything meaningful. So at this point catDF is not a DataFrame and you can't treat it as one. You probably want to keep using categories instead of catDF in the lines that follow.
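A minimal sketch of that change (column names taken from the question; getAs by name and the Int type for id are assumptions based on the row_number column):
// insertInto is a side effect that returns Unit, so don't capture its result.
categories.write.insertInto("categories")

// Build the lookup map from the categories DataFrame itself.
val catMap = categories.rdd
  .map((row: Row) => row.getAs[String]("description") -> row.getAs[Int]("id"))
  .collectAsMap()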

About an error accessing a field inside Tuple2

I am trying to access a field within a Tuple2 and the compiler is returning an error. The software pushes a case class to a Kafka topic; then I want to recover it using Spark Streaming so I can feed a machine-learning algorithm and save the results in a Mongo instance.
Solved!
I finally solved my problem; I am going to post the final solution.
This is the GitHub project:
https://github.com/alonsoir/awesome-recommendation-engine/tree/develop
build.sbt
name := "my-recommendation-spark-engine"
version := "1.0-SNAPSHOT"
scalaVersion := "2.10.4"
val sparkVersion = "1.6.1"
val akkaVersion = "2.3.11" // override Akka to be this version to match the one in Spark
libraryDependencies ++= Seq(
"org.apache.kafka" % "kafka_2.10" % "0.8.1"
exclude("javax.jms", "jms")
exclude("com.sun.jdmk", "jmxtools")
exclude("com.sun.jmx", "jmxri"),
//not working play module!! check
//jdbc,
//anorm,
//cache,
// HTTP client
"net.databinder.dispatch" %% "dispatch-core" % "0.11.1",
// HTML parser
"org.jodd" % "jodd-lagarto" % "3.5.2",
"com.typesafe" % "config" % "1.2.1",
"com.typesafe.play" % "play-json_2.10" % "2.4.0-M2",
"org.scalatest" % "scalatest_2.10" % "2.2.1" % "test",
"org.twitter4j" % "twitter4j-core" % "4.0.2",
"org.twitter4j" % "twitter4j-stream" % "4.0.2",
"org.codehaus.jackson" % "jackson-core-asl" % "1.6.1",
"org.scala-tools.testing" % "specs_2.8.0" % "1.6.5" % "test",
"org.apache.spark" % "spark-streaming-kafka_2.10" % "1.6.1" ,
"org.apache.spark" % "spark-core_2.10" % "1.6.1" ,
"org.apache.spark" % "spark-streaming_2.10" % "1.6.1",
"org.apache.spark" % "spark-sql_2.10" % "1.6.1",
"org.apache.spark" % "spark-mllib_2.10" % "1.6.1",
"com.google.code.gson" % "gson" % "2.6.2",
"commons-cli" % "commons-cli" % "1.3.1",
"com.stratio.datasource" % "spark-mongodb_2.10" % "0.11.1",
// Akka
"com.typesafe.akka" %% "akka-actor" % akkaVersion,
"com.typesafe.akka" %% "akka-slf4j" % akkaVersion,
// MongoDB
"org.reactivemongo" %% "reactivemongo" % "0.10.0"
)
packAutoSettings
//play.Project.playScalaSettings
Kafka Producer
package example.producer
import play.api.libs.json._
import example.utils._
import scala.concurrent.Future
import example.model.{AmazonProductAndRating,AmazonProduct,AmazonRating}
import example.utils.AmazonPageParser
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future
/**
args(0) : productId
args(1) : userId
Usage: ./amazon-producer-example 0981531679 someUserId 3.0
*/
object AmazonProducerExample {
def main(args: Array[String]): Unit = {
val productId = args(0).toString
val userId = args(1).toString
val rating = args(2).toDouble
val topicName = "amazonRatingsTopic"
val producer = Producer[String](topicName)
//0981531679 is Scala Puzzlers...
AmazonPageParser.parse(productId,userId,rating).onSuccess { case amazonRating =>
//Is this the correct way? the best performance? possibly not, what about using avro or parquet? How can i push data in avro or parquet format?
//You can see that i am pushing json String to kafka topic, not raw String, but is there any difference?
//of course there are differences...
producer.send(Json.toJson(amazonRating).toString)
//producer.send(amazonRating.toString)
println("amazon product with rating sent to kafka cluster..." + amazonRating.toString)
System.exit(0)
}
}
}
This is the definition of the necessary case classes (UPDATED); the file is named models.scala:
package example.model
import play.api.libs.json.Json
import reactivemongo.bson.Macros
case class AmazonProduct(itemId: String, title: String, url: String, img: String, description: String)
case class AmazonRating(userId: String, productId: String, rating: Double)
case class AmazonProductAndRating(product: AmazonProduct, rating: AmazonRating)
// For MongoDB
object AmazonRating {
implicit val amazonRatingHandler = Macros.handler[AmazonRating]
implicit val amazonRatingFormat = Json.format[AmazonRating]
//added using @Yuval's tip
lazy val empty: AmazonRating = AmazonRating("-1", "-1", -1d)
}
This is the full code of the spark streaming process:
package example.spark
import java.io.File
import java.util.Date
import play.api.libs.json._
import com.google.gson.{Gson,GsonBuilder, JsonParser}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import com.mongodb.casbah.Imports._
import com.mongodb.QueryBuilder
import com.mongodb.casbah.MongoClient
import com.mongodb.casbah.commons.{MongoDBList, MongoDBObject}
import reactivemongo.api.MongoDriver
import reactivemongo.api.collections.default.BSONCollection
import reactivemongo.bson.BSONDocument
import org.apache.spark.streaming.kafka._
import kafka.serializer.StringDecoder
import example.model._
import example.utils.Recommender
/**
* Collect at least the specified number of json amazon products in order to feed the recommendation system and feed the mongo instance with results.
Usage: ./amazon-kafka-connector 127.0.0.1:9092 amazonRatingsTopic
on mongo shell:
use alonsodb;
db.amazonRatings.find();
*/
object AmazonKafkaConnector {
private var numAmazonProductCollected = 0L
private var partNum = 0
private val numAmazonProductToCollect = 10000000
//this settings must be in reference.conf
private val Database = "alonsodb"
private val ratingCollection = "amazonRatings"
private val MongoHost = "127.0.0.1"
private val MongoPort = 27017
private val MongoProvider = "com.stratio.datasource.mongodb"
private val jsonParser = new JsonParser()
private val gson = new GsonBuilder().setPrettyPrinting().create()
private def prepareMongoEnvironment(): MongoClient = {
val mongoClient = MongoClient(MongoHost, MongoPort)
mongoClient
}
private def closeMongoEnviroment(mongoClient : MongoClient) = {
mongoClient.close()
println("mongoclient closed!")
}
private def cleanMongoEnvironment(mongoClient: MongoClient) = {
cleanMongoData(mongoClient)
mongoClient.close()
}
private def cleanMongoData(client: MongoClient): Unit = {
val collection = client(Database)(ratingCollection)
collection.dropCollection()
}
def main(args: Array[String]) {
// Process program arguments and set properties
if (args.length < 2) {
System.err.println("Usage: " + this.getClass.getSimpleName + " <brokers> <topics>")
System.exit(1)
}
val Array(brokers, topics) = args
println("Initializing Streaming Spark Context and kafka connector...")
// Create context with 2 second batch interval
val sparkConf = new SparkConf().setAppName("AmazonKafkaConnector")
.setMaster("local[4]")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(sparkConf)
val sqlContext = new SQLContext(sc)
sc.addJar("target/scala-2.10/blog-spark-recommendation_2.10-1.0-SNAPSHOT.jar")
val ssc = new StreamingContext(sparkConf, Seconds(2))
//this checkpointdir should be in a conf file, for now it is hardcoded!
val streamingCheckpointDir = "/Users/aironman/my-recommendation-spark-engine/checkpoint"
ssc.checkpoint(streamingCheckpointDir)
// Create direct kafka stream with brokers and topics
val topicsSet = topics.split(",").toSet
val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
println("Initialized Streaming Spark Context and kafka connector...")
//create recommendation module
println("Creating rating recommender module...")
val ratingFile= "ratings.csv"
val recommender = new Recommender(sc,ratingFile)
println("Initialized rating recommender module...")
//THIS IS THE MOST INTERESTING PART AND WHAT I NEED!
//THE SOLUTION IS NOT PROBABLY THE MOST EFFICIENT, BECAUSE I HAD TO
//USE DATAFRAMES, ARRAYs and SEQs BUT IS FUNCTIONAL!
try{
messages.foreachRDD(rdd => {
val count = rdd.count()
if (count > 0){
val json= rdd.map(_._2)
val dataFrame = sqlContext.read.json(json) //converts json to DF
val myRow = dataFrame.select(dataFrame("userId"),dataFrame("productId"),dataFrame("rating")).take(count.toInt)
println("myRow is: " + myRow)
val myAmazonRating = AmazonRating(myRow(0).getString(0), myRow(0).getString(1), myRow(0).getDouble(2))
println("myAmazonRating is: " + myAmazonRating.toString)
val arrayAmazonRating = Array(myAmazonRating)
//this method needs Seq[AmazonRating]
recommender.predictWithALS(arrayAmazonRating.toSeq)
}//if
})
}catch{
case e: IllegalArgumentException => {println("illegal arg. exception")};
case e: IllegalStateException => {println("illegal state exception")};
case e: ClassCastException => {println("ClassCastException")};
case e: Exception => {println(" Generic Exception")};
}finally{
println("Finished taking data from kafka topic...")
}
ssc.start()
ssc.awaitTermination()
println("Finished!")
}
}
Thank you all, folks: @Yuval, @Emecas and @Riccardo.cardin.
The Recommender.predict method signature looks like:
def predict(ratings: Seq[AmazonRating]) = {
// train model
val myRatings = ratings.map(toSparkRating)
val myRatingRDD = sc.parallelize(myRatings)
val startAls = DateTime.now
val model = ALS.train((sparkRatings ++ myRatingRDD).repartition(NumPartitions), 10, 20, 0.01)
val myProducts = myRatings.map(_.product).toSet
val candidates = sc.parallelize((0 until productDict.size).filterNot(myProducts.contains))
// get ratings of all products not in my history ordered by rating (higher first) and only keep the first NumRecommendations
val myUserId = userDict.getIndex(MyUsername)
val recommendations = model.predict(candidates.map((myUserId, _))).collect
val endAls = DateTime.now
val result = recommendations.sortBy(-_.rating).take(NumRecommendations).map(toAmazonRating)
val alsTime = Seconds.secondsBetween(startAls, endAls).getSeconds
println(s"ALS Time: $alsTime seconds")
result
}
//I think I've been as clear as possible, tell me if you need anything more and thanks for your patience teaching me, @Yuval
Diagnosis
IllegalStateException suggests that you are operating on a StreamingContext that is already ACTIVE or STOPPED. See details here (lines 218-231):
java.lang.IllegalStateException: Adding new inputs, transformations, and output operations after starting a context is not supported
Code Review
Looking at your AmazonKafkaConnector code, you are doing map, filter and foreachRDD inside another foreachRDD over the same DirectStream object called messages.
General advice:
Be functional, my friend: divide your logic into small pieces, one for each of the tasks you want to perform:
Streaming
ML Recommendation
Persistence
etc.
That will help you understand and debug the Spark pipeline you want to implement more easily. A sketch of that shape follows.
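A minimal sketch, reusing messages, recommender and ssc from the question (parseRating is a hypothetical helper standing in for the JSON parsing shown further down in this thread):
// Declare the whole pipeline first: adding inputs, transformations or
// output operations after ssc.start() is what triggers the IllegalStateException.
val ratings = messages.map { case (_, json) => parseRating(json) }            // Streaming
ratings.foreachRDD(rdd => recommender.predictWithALS(rdd.collect().toSeq))    // ML recommendation + persistence

ssc.start()
ssc.awaitTermination()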
The problem is that the statement rdd.take(count.toInt) returns an Array[T], as stated here:
def take(num: Int): Array[T]
Take the first num elements of the RDD.
You're telling your RDD to take its first n elements. So, differently from what you assumed, you don't have an object of type Tuple2, but an array.
If you want to print each element of the array, you can use the mkString method defined on Array to obtain a single String with all the elements of the array.
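For instance, with the myRow value from the question (a sketch; only the printing changes):
val myRow = dataFrame
  .select(dataFrame("userId"), dataFrame("productId"), dataFrame("rating"))
  .take(count.toInt)                              // Array[Row], not a Tuple2

// Print every Row in the array as a single String.
println("myRow is: " + myRow.mkString("[", ", ", "]"))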
It looks like what you're trying to do is simply a map over a DStream. A map operation is a projection from type A to type B, where A is a String (what you're receiving from Kafka) and B is your case class AmazonRating.
Let's add an empty value to your AmazonRating:
case class AmazonRating(userId: String, productId: String, rating: Double)

object AmazonRating {
  lazy val empty: AmazonRating = AmazonRating("-1", "-1", -1d)
}
Now let's parse the JSONs:
val messages = KafkaUtils
  .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

messages
  .map { case (_, jsonRating) =>
    val format = Json.format[AmazonRating]
    val jsValue = Json.parse(jsonRating)
    format.reads(jsValue) match {
      case JsSuccess(rating, _) => rating
      case JsError(_)           => AmazonRating.empty
    }
  }
  .filter(_ != AmazonRating.empty)
  .foreachRDD(_.foreachPartition(it => recommender.predict(it.toSeq)))

Spark unit testing with DataFrame: collect returns an empty array

I'm using Spark and I've been struggling to make a simple unit test pass with a DataFrame and Spark SQL.
Here is the code snippet:
class TestDFSpec extends SharedSparkContext {
"Test DF " should {
"pass equality" in {
val createDF = sqlCtx.createDataFrame(createsRDD,classOf[Test]).toDF()
createDF.registerTempTable("test")
sqlCtx.sql("select * FROM test").collectAsList() === List(Row(Test.from(create1)),Row(Test.from(create2)))
}
}
val create1 = "4869215,bbbbb"
val create2 = "4869215,aaaaa"
val createsRDD = sparkContext.parallelize(Seq(create1,create2)).map(Test.from)
}
I copied code from the Spark GitHub repo and added some small changes to provide a SQLContext:
trait SharedSparkContext extends Specification with BeforeAfterAll {
import net.lizeo.bi.spark.conf.JobConfiguration._
@transient private var _sql: SQLContext = _
def sqlCtx: SQLContext = _sql
override def beforeAll() {
println(sparkConf)
_sql = new SQLContext(sparkContext)
}
override def afterAll() {
sparkContext.stop()
_sql = null
}
}
The Test model is pretty simple:
case class Test(key:Int, value:String)
object Test {
def from(line:String):Test = {
val f = line.split(",")
Test(f(0).toInt,f(1))
}
}
The job configuration object:
object JobConfiguration {
val conf = ConfigFactory.load()
val sparkName = conf.getString("spark.name")
val sparkMaster = conf.getString("spark.master")
lazy val sparkConf = new SparkConf()
.setAppName(sparkName)
.setMaster(sparkMaster)
.set("spark.executor.memory",conf.getString("spark.executor.memory"))
.set("spark.io.compression.codec",conf.getString("spark.io.compression.codec"))
val sparkContext = new SparkContext(sparkConf)
}
I'm using Spark 1.3.0 with Specs2. The exact dependencies from my sbt project files are:
object Dependencies {
private val sparkVersion = "1.3.0"
private val clouderaVersion = "5.4.4"
private val sparkClouderaVersion = s"$sparkVersion-cdh$clouderaVersion"
val sparkCdhDependencies = Seq(
"org.apache.spark" %% "spark-core" % sparkClouderaVersion % "provided",
"org.apache.spark" %% "spark-sql" % sparkClouderaVersion % "provided"
)
}
The test output is:
[info] TestDFSpec
[info]
[info] Test DF should
[error] x pass equality
[error] '[[], []]'
[error]
[error] is not equal to
[error]
[error] List([Test(4869215,bbbbb)], [Test(4869215,aaaaa)]) (TestDFSpec.scala:17)
[error] Actual:   [[], []]
[error] Expected: List([Test(4869215,bbbbb)], [Test(4869215,aaaaa)])
sqlCtx.sql("select * FROM test").collectAsList() return [[], []]
Any help would be greatly appreciated. I didn't meet any problem testing with RDD
I do want to migrate from RDD to Dataframe and be able to use Parquet directly from Spark to store files
Thanks in advance
The test passes with the following code:
class TestDFSpec extends SharedSparkContext {
import sqlCtx.implicits._
"Test DF " should {
"pass equality" in {
val createDF = sqlCtx.createDataFrame(Seq(create1,create2).map(Test.from))
createDF.registerTempTable("test")
val result = sqlCtx.sql("select * FROM test").collect()
result === Array(Test.from(create1),Test.from(create2)).map(Row.fromTuple)
}
}
val create1 = "4869215,bbbbb"
val create2 = "4869215,aaaaa"
}
The main difference is the way the DataFrame is created: from a Seq[Test] instead of an RDD[Test].
I asked for an explanation on the Spark mailing list: http://apache-spark-user-list.1001560.n3.nabble.com/Unit-testing-dataframe-td24240.html#none
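A plausible reason for the empty rows, offered as an assumption rather than something confirmed in the thread: createDataFrame(createsRDD, classOf[Test]) is the Java-bean overload, which reads getX-style getters that a plain Scala case class does not have, so the inferred schema ends up empty. Creating the DataFrame straight from the RDD of case classes should also work in Spark 1.3, roughly:
// The overload that takes an RDD of case classes derives the schema from
// the case class fields via Scala reflection, not from Java-bean getters.
val createDF = sqlCtx.createDataFrame(createsRDD)   // createsRDD: RDD[Test]
// or, with import sqlCtx.implicits._ in scope: val createDF = createsRDD.toDF()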

embedmongo with reactivemongo process does not exit

I am trying to do some tests using ScalaTest + embedmongo + reactivemongo but I am failing. My first problem is that after a test the mongod process does not shut down; I get this message in the console:
INFO: stopOrDestroyProcess: process has not exited
and the tests are paused until I kill the process manually. That happens even if the body of my test is empty. I am running Windows 8.1.
The other issue is that when I try to connect to the db inside the test using ReactiveMongo and insert anything into the db, I get this exception:
reactivemongo.core.errors.ConnectionNotInitialized: MongoError['Connection is missing metadata (like protocol version, etc.) The connection pool is probably being initialized.']
I have literally no idea how to set it up. Here is my test code:
package model
import com.github.simplyscala.MongoEmbedDatabase
import org.scalatest.{OptionValues, Matchers, BeforeAndAfter, FlatSpec}
import reactivemongo.api.MongoDriver
import reactivemongo.api.collections.bson.BSONCollection
import scala.concurrent.duration._
import reactivemongo.bson.BSONDocument
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Await
class MongoBookingRepositoryTest extends FlatSpec
with Matchers
with OptionValues
with MongoEmbedDatabase
with BeforeAndAfter {
"A MongoBookingRepository" should "..." in withEmbedMongoFixture(port = 12345) { mongoProps =>
val driver = new MongoDriver
val connection = driver.connection("localhost:12345" :: Nil)
val db = connection("testDatabase")
val collection = db.collection[BSONCollection]("bookings")
val future = collection.insert(BSONDocument("a" -> 5))
println(Await.result(future, 3.seconds))
driver.close()
connection.close()
}
}
In Play 2.4 I tried the setup below and it worked.
Dependencies:
"org.reactivemongo" %% "play2-reactivemongo" % "0.11.7.play24",
"org.reactivemongo" %% "reactivemongo-extensions-json" % "0.11.7.play24",
// test
"org.scalatest" %% "scalatest" % "2.2.4" % Test,
"de.flapdoodle.embed" % "de.flapdoodle.embed.mongo" % "1.50.0" % Test,
"org.mockito" % "mockito-core" % "1.10.19" % Test
TestMongoSetup:
import de.flapdoodle.embed.mongo.config.{Net, MongodConfigBuilder}
import de.flapdoodle.embed.mongo.distribution.Version
import de.flapdoodle.embed.mongo.{MongodStarter, MongodProcess, MongodExecutable}
import de.flapdoodle.embed.process.runtime.Network
import org.mockito.Mockito._
import play.modules.reactivemongo.ReactiveMongoApi
import reactivemongo.api.MongoConnection.ParsedURI
import reactivemongo.api.{MongoConnectionOptions, MongoDriver, MongoConnection}
import scala.concurrent.ExecutionContext
trait TestMongoSetup {
private var port : Int = _
private var mongodExe: MongodExecutable = _
private var mongodProcess: MongodProcess = _
def start(intiAtPort: Int): Unit = {
port=intiAtPort
mongodExe = MongodStarter.getDefaultInstance.prepare(
new MongodConfigBuilder()
.version(Version.Main.V3_0)
.net(new Net(port, Network.localhostIsIPv6()))
.build()
)
mongodProcess = mongodExe.start()
}
def stop(): Unit = {
mongodProcess.stop()
mongodExe.stop()
}
def createConnection(): MongoConnection = {
val driver = new MongoDriver
driver.connection(ParsedURI(
hosts = List(("localhost", port)),
options = MongoConnectionOptions(),
ignoredOptions = List.empty[String],
db = None,
authenticate = None
))
}
def createMockedReactiveMongoApi(dbName: String)(implicit ctx: ExecutionContext): ReactiveMongoApi = {
val connection = createConnection()
val db = connection(dbName)
val api = mock(classOf[ReactiveMongoApi])
doReturn(db).when(api).db
doReturn(connection).when(api).connection
api
}
}
TestClass:
import db.TestMongoSetup
import models.dao.UserDAO
import org.scalatest._
import play.modules.reactivemongo.ReactiveMongoApi
import scala.concurrent.Await
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration.Duration
class UserServiceTest extends FlatSpec with ShouldMatchers with GivenWhenThen with BeforeAndAfterAll with TestMongoSetup {
private var mockedAPI: ReactiveMongoApi = _
var dao: UserDAO = _
val port : Int = 12345
override def beforeAll(): Unit = {
start(port)
mockedAPI = createMockedReactiveMongoApi("testDB")
dao = new UserDAO(mockedAPI)
}
override def afterAll(): Unit = {
mockedAPI.connection.actorSystem.shutdown()
mockedAPI.connection.actorSystem.awaitTermination()
stop()
}
"Check" should "check User object into DB" in {
Given("a user info")
val email = "xyx#abc.com"
val pwd= "abcddd"
When("it fetch from DB")
val fromDBUser = Await.result(dao.fetch(email,pwd), Duration.Inf)
Then("it should be fetched")
fromDBUser.get.email should equal(email)
}
}
withEmbedMongoFixture doesn't work very well.
Prefer the "classic" way: https://github.com/SimplyScala/scalatest-embedmongo#basic-usage-mutable-way
For Mongo 3, use the 2.3_SNAPSHOT version of the library.
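A minimal sketch of that mutable style, assuming the mongoStart/mongoStop helpers and the MongodProps type described in the scalatest-embedmongo README (untested here):
import com.github.simplyscala.{MongoEmbedDatabase, MongodProps}
import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}

class MongoBookingRepositoryTest extends FlatSpec
    with Matchers
    with MongoEmbedDatabase
    with BeforeAndAfter {

  private var mongoProps: MongodProps = _

  // Start and stop the embedded mongod explicitly instead of using withEmbedMongoFixture.
  before { mongoProps = mongoStart(12345) }
  after { mongoStop(mongoProps) }

  "A MongoBookingRepository" should "insert a document" in {
    // connect with ReactiveMongo to localhost:12345 and run assertions here
  }
}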