Is it possible to serialize non-case classes in Scala?

Is it possible to serialize an object of the class below using Json4s, Lift, or any other library?
import scala.collection.mutable.{ArrayBuffer, Queue} // assuming the mutable collections

class User(uId: Int) extends Serializable {
  var id: Int = uId
  var active: Boolean = false
  var numTweets: Int = 0
  var followers: ArrayBuffer[Int] = null
  var following: ArrayBuffer[Int] = null
  var userTimeline: Queue[String] = null
  var homeTimeline: Queue[String] = null
  var userTimelineSize: Int = 0
  var homeTimelineSize: Int = 0
  //var notifications: Queue[String] = null
  var mentions: Queue[String] = null
  var directMessages: Queue[String] = null
}

You can use Json4s for this purpose (with the help of FieldSerializer); below is code to get started with serializing the User object:
def main(args: Array[String]) {
  import org.json4s._
  import org.json4s.native.Serialization
  import org.json4s.native.Serialization.{read, write, writePretty}

  implicit val formats = DefaultFormats + FieldSerializer[User]()

  val user = new User(12)
  val json = write(user)
  println(writePretty(user))
}
Also note that in a non-case class, any field that may be missing from the JSON needs to be an Option.
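For instance, a minimal sketch (the PartialUser class and its bio field are made up purely for illustration, reusing the json4s imports from the snippet above):
// hypothetical class: "bio" may be absent from the incoming JSON, so it is an Option
class PartialUser(val id: Int) {
  var bio: Option[String] = None
}

implicit val formats = DefaultFormats + FieldSerializer[PartialUser]()
val parsed = read[PartialUser]("""{"id": 12}""")
println(parsed.bio) // None, because "bio" was missing from the JSON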
Another method would be to go for Genson:
def main(args: Array[String]) {
  import com.owlike.genson._
  import com.owlike.genson.ext.json4s._
  import org.json4s._
  import org.json4s.JsonDSL._
  import org.json4s.JsonAST._

  object CustomGenson {
    val genson = new ScalaGenson(
      new GensonBuilder()
        .withBundle(ScalaBundle(), Json4SBundle())
        .create()
    )
  }

  // then just import it in the places you want to use this instance instead of the default one
  import CustomGenson.genson._

  val user = new User(12)
  val jsonArray = toJson(user)
  println(jsonArray)
}

How to build uber jar for Spark Structured Streaming application to MongoDB sink

I am unable to build a fat jar for my Kafka-SparkStructuredStreaming-MongoDB pipeline.
I have built StructuredStreamingProgram: it receives streaming data from Kafka topics and applies some parsing, and my intention is then to save the structured streaming data into a MongoDB collection.
I followed this article to build my pipeline: https://learningfromdata.blog/2017/04/16/real-time-data-ingestion-with-apache-spark-structured-streaming-implementation/
I have created Helpers.scala and MongoDBForeachWriter.scala as suggested in the article for my streaming pipeline and saved them under src/main/scala/example.
When I run sbt assembly to build a fat jar, I get these errors:
"[error] C:\spark_streaming\src\main\scala\example\structuredStreamApp.scala:63: class MongoDBForeachWriter is abstract; cannot be instantiated
[error] val structuredStreamForeachWriter: MongoDBForeachWriter = new MongoDBForeachWriter(mongodb_uri,mdb_name,mdb_collection,CountAccum)"
I need guidance on making this pipeline work.
Any help will be appreciated.
package example
import java.util.Calendar
import org.apache.spark.util.LongAccumulator
import org.apache.spark.sql.Row
import org.apache.spark.sql.ForeachWriter
import org.mongodb.scala._
import org.mongodb.scala.bson.collection.mutable.Document
import org.mongodb.scala.bson._
import example.Helpers._
abstract class MongoDBForeachWriter(p_uri: String,
p_dbName: String,
p_collectionName: String,
p_messageCountAccum: LongAccumulator) extends ForeachWriter[Row] {
val mongodbURI = p_uri
val dbName = p_dbName
val collectionName = p_collectionName
val messageCountAccum = p_messageCountAccum
var mongoClient: MongoClient = null
var db: MongoDatabase = null
var collection: MongoCollection[Document] = null
def ensureMongoDBConnection(): Unit = {
if (mongoClient == null) {
mongoClient = MongoClient(mongodbURI)
db = mongoClient.getDatabase(dbName)
collection = db.getCollection(collectionName)
}
}
override def open(partitionId: Long, version: Long): Boolean = {
true
}
override def process(record: Row): Unit = {
val valueStr = new String(record.getAs[Array[Byte]]("value"))
val doc: Document = Document(valueStr)
doc += ("log_time" -> Calendar.getInstance().getTime())
// lazy opening of MongoDB connection
ensureMongoDBConnection()
val result = collection.insertOne(doc).results()
// tracks how many records I have processed
if (messageCountAccum != null)
messageCountAccum.add(1)
}
}
package example
import java.util.concurrent.TimeUnit
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import org.mongodb.scala._
object Helpers {
implicit class DocumentObservable[C](val observable: Observable[Document]) extends ImplicitObservable[Document] {
override val converter: (Document) => String = (doc) => doc.toJson
}
implicit class GenericObservable[C](val observable: Observable[C]) extends ImplicitObservable[C] {
override val converter: (C) => String = (doc) => doc.toString
}
trait ImplicitObservable[C] {
val observable: Observable[C]
val converter: (C) => String
def results(): Seq[C] = Await.result(observable.toFuture(), Duration(10, TimeUnit.SECONDS))
def headResult() = Await.result(observable.head(), Duration(10, TimeUnit.SECONDS))
def printResults(initial: String = ""): Unit = {
if (initial.length > 0) print(initial)
results().foreach(res => println(converter(res)))
}
def printHeadResult(initial: String = ""): Unit = println(s"${initial}${converter(headResult())}")
}
}
package example
import org.apache.spark.sql.functions.{col, _}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.util.LongAccumulator
import example.Helpers._
import java.util.Calendar
object StructuredStreamingProgram {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder()
.appName("OSB_Streaming_Model")
.getOrCreate()
import spark.implicits._
val df = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "10.160.172.45:9092, 10.160.172.46:9092, 10.160.172.100:9092")
.option("subscribe", "TOPIC_WITH_COMP_P2_R2, TOPIC_WITH_COMP_P2_R2.DIT, TOPIC_WITHOUT_COMP_P2_R2.DIT")
.load()
val dfs = df.selectExpr("CAST(value AS STRING)").toDF()
val data =dfs.withColumn("splitted", split($"SERVICE_NAME8", "/"))
.select($"splitted".getItem(4).alias("region"),$"splitted".getItem(5).alias("service"),col("_raw"))
.withColumn("service_type", regexp_extract($"service", """.*(Inbound|Outbound|Outound).*""",1))
.withColumn("region_type", concat(
when(col("region").isNotNull,col("region")).otherwise(lit("null")), lit(" "),
when(col("service").isNotNull,col("service_type")).otherwise(lit("null"))))
val extractedDF = data.filter(
col("region").isNotNull &&
col("service").isNotNull &&
col("_raw").isNotNull &&
col("service_type").isNotNull &&
col("region_type").isNotNull)
.filter("region != ''")
.filter("service != ''")
.filter("_raw != ''")
.filter("service_type != ''")
.filter("region_type != ''")
// sends to MongoDB once every 20 seconds
val mongodb_uri = "mongodb://dstk8sdev06.us.dell.com/?maxPoolSize=1"
val mdb_name = "HANZO_MDB"
val mdb_collection = "Testing_Spark"
val CountAccum: LongAccumulator = spark.sparkContext.longAccumulator("mongostreamcount")
val structuredStreamForeachWriter: MongoDBForeachWriter = new MongoDBForeachWriter(mongodb_uri,mdb_name,mdb_collection,CountAccum)
val query = df.writeStream
.foreach(structuredStreamForeachWriter)
.trigger(Trigger.ProcessingTime("20 seconds"))
.start()
while (!spark.streams.awaitAnyTermination(60000)) {
println(Calendar.getInstance().getTime()+" :: mongoEventsCount = "+CountAccum.value)
}
}
}
With the necessary corrections to the above, I need to be able to save the structured streaming data into MongoDB.
You cannot instantiate an abstract class. To resolve this issue, implement the close function in the MongoDBForeachWriter class and make it a concrete class.
import scala.util.Try // needed for the Try in close()

class MongoDBForeachWriter(p_uri: String,
                           p_dbName: String,
                           p_collectionName: String,
                           p_messageCountAccum: LongAccumulator) extends ForeachWriter[Row] {

  val mongodbURI = p_uri
  val dbName = p_dbName
  val collectionName = p_collectionName
  val messageCountAccum = p_messageCountAccum

  var mongoClient: MongoClient = null
  var db: MongoDatabase = null
  var collection: MongoCollection[Document] = null

  def ensureMongoDBConnection(): Unit = {
    if (mongoClient == null) {
      mongoClient = MongoClient(mongodbURI)
      db = mongoClient.getDatabase(dbName)
      collection = db.getCollection(collectionName)
    }
  }

  override def open(partitionId: Long, version: Long): Boolean = {
    true
  }

  override def process(record: Row): Unit = {
    val valueStr = new String(record.getAs[Array[Byte]]("value"))

    val doc: Document = Document(valueStr)
    doc += ("log_time" -> Calendar.getInstance().getTime())

    // lazy opening of MongoDB connection
    ensureMongoDBConnection()
    // .results() (from Helpers) blocks until the insert completes; without it the Observable is never executed
    val result = collection.insertOne(doc).results()

    // tracks how many records I have processed
    if (messageCountAccum != null)
      messageCountAccum.add(1)
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (mongoClient != null) {
      Try {
        mongoClient.close()
      }
    }
  }
}
Hope this helps.
Ravi

Connect to Amazon account using Scala

I want to connect to my Amazon account in order to delete resources inside my S3 storage.
I have the access key and secret key, and this is how I started to build my connection to Amazon:
def connectToAmazon(): Unit = {
  val AWS_ACCESS_KEY = conf.getString("WebRecorder.PushSession.AccessKey")
  val AWS_SECRET_KEY = conf.getString("WebRecorder.PushSession.SecretKey")
  val AWSCredentials = new BasicAWSCredentials(AWS_ACCESS_KEY, AWS_SECRET_KEY)
}
Can you elaborate on how I may do this?
I used this solution to get bucket name and number of objects:
import scala.collection.JavaConversions._
import scala.language.postfixOps
import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}
import com.amazonaws.services.s3.{AmazonS3Client, AmazonS3ClientBuilder}
import com.amazonaws.services.s3.model.{GetObjectTaggingRequest, ObjectListing, S3ObjectSummary}
import com.clicktale.pipeline.framework.dal.ConfigParser.conf
class Amazon {
val AWS_ACCESS_KEY = conf.getString("WebRecorder.PushSession.AccessKey")
val AWS_SECRET_KEY = conf.getString("WebRecorder.PushSession.SecretKey")
val bucketName = "nv-q-s3-assets-01"
val provider = new AWSStaticCredentialsProvider(new BasicAWSCredentials(AWS_ACCESS_KEY, AWS_SECRET_KEY))
val client = AmazonS3ClientBuilder.standard().withCredentials(provider).withRegion("us-east-1").build()
// def connectToAmazon(): Unit = {
//
// val provider = new AWSStaticCredentialsProvider(new BasicAWSCredentials(AWS_ACCESS_KEY, AWS_SECRET_KEY))
// val client = AmazonS3ClientBuilder.standard().withCredentials(provider).withRegion("us-east-1").build()
def removeObjectsFromBucket(){
println("Removing objects from bucket...")
var object_listing: ObjectListing = client.listObjects(bucketName)
var flag: Boolean = true
while (flag) {
val iterator: Iterator[_] = object_listing.getObjectSummaries.iterator()
while (iterator.hasNext) {
val summary: S3ObjectSummary = iterator.next().asInstanceOf[S3ObjectSummary]
client.deleteObject(bucketName, summary.getKey())
}
flag=false
}
}
def countNumberOfObjectsInsideBucket(): Unit ={
var object_listing: ObjectListing = client.listObjects(bucketName)
var flag: Boolean = true
var count=0
while (flag) {
val iterator: Iterator[_] = object_listing.getObjectSummaries.iterator()
while (iterator.hasNext) {
val summary: S3ObjectSummary = iterator.next().asInstanceOf[S3ObjectSummary]
count+=1
}
flag=false
println("Number of objects are: " + count)
}
}
}
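For completeness, a minimal usage sketch of the class above (nothing here beyond the methods already defined in it):
val amazon = new Amazon
amazon.countNumberOfObjectsInsideBucket()
amazon.removeObjectsFromBucket()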
You need an AWSCredentialsProvider:
val provider = new AWSStaticCredentialsProvider(
  new BasicAWSCredentials(AWS_ACCESS_KEY, AWS_SECRET_KEY)
)
and then use it to create the client:
val client = AmazonS3ClientBuilder
  .standard
  .withCredentials(provider)
  .withRegion("us-west-1") // or whatever your region is
  .build
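Since the goal is to delete resources, here is a rough sketch of how that client could then be used; the bucket and key names are placeholders, and a bucket with more than one page of listings would also need the truncated listing followed:
import scala.collection.JavaConversions._

// delete a single object
client.deleteObject("my-bucket", "path/to/object")

// delete everything returned by one listing of the bucket
for (summary <- client.listObjects("my-bucket").getObjectSummaries)
  client.deleteObject("my-bucket", summary.getKey)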

How to return model query result as JSON in scala play framework

I am using Play Framework 2.1.1 with Scala. I query a database table, return the result to the controller as a list, convert the list to a string, and return that to an AJAX call made from JavaScript code.
How can I return the query result as JSON to the AJAX call through the controller?
Application.scala
import play.api._
import play.api.mvc._
import play.api.data._
import views.html._
import models._
object Application extends Controller {
def index = Action {
Ok(views.html.index())
}
def getSummaryTable = Action{
var sum="Summary Table";
Ok(ajax_result.render((Timesheet.getAll).mkString("\n")))
}
def javascriptRoutes = Action { implicit request =>
import routes.javascript._
Ok(
Routes.javascriptRouter("jsRoutes")(
// Routes
controllers.routes.javascript.Application.getSummaryTable
)
).as("text/javascript")
}
}
TimeSheet.scala
// Use PostgresDriver to connect to a Postgres database
import scala.slick.driver.PostgresDriver.simple._
import scala.slick.lifted.{MappedTypeMapper,BaseTypeMapper,TypeMapperDelegate}
import scala.slick.driver.BasicProfile
import scala.slick.session.{PositionedParameters,PositionedResult}
// Use the implicit threadLocalSession
import Database.threadLocalSession
import java.sql.Date
import java.sql.Time
case class Timesheet(ID: Int, dateVal: String, entryTime: Time, exitTime: Time, someVal: String)
object Timesheet {
//Definition of Timesheet table
// object TS extends Table[(Int,String,Time,Time,String)]("timesheet"){
val TSTable = new Table[Timesheet]("timesheet"){
def ID = column[Int]("id")
def dateVal = column[String]("date")
def entryTime = column[Time]("entry_time")
def exitTime = column[Time]("exit_time")
def someVal = column[String]("someval")
def * = ID ~ dateVal ~ entryTime ~ exitTime ~ someVal <> (Timesheet.apply _, Timesheet.unapply _)
}
def getAll: Seq[Timesheet] = {
Database.forURL("jdbc:postgresql://localhost:5432/my_db", "postgres", "password",null, driver="org.postgresql.Driver") withSession{
val q = Query(TSTable)
val qFiltered = q.filter(_.ID === 41 )
val qDateFilter = qFiltered.filter(_.dateVal === "01/03/2013")
val qSorted = qDateFilter.sortBy(_.entryTime)
qSorted.list
}
}
}
Also, don't forget to provide an implicit (or explicit) JSON serializer (a Writes) for your model, otherwise the Scala compiler will yell at you :-). You can do something like:
def allTimesheet = Action {
  val timesheetWrites = Json.writes[Timesheet] // here it's the serializer (a Writes)
  val listofTimeSheet = Timesheet.getAll
  Ok(JsArray(listofTimeSheet.map(timesheetWrites.writes))) // apply the Writes to each element explicitly
}
or you can use an implicit:
def allTimesheet = Action {
  implicit val timesheetWrites = Json.writes[Timesheet] // here it's the serializer (a Writes)
  val listofTimeSheet = Timesheet.getAll
  Ok(Json.toJson(listofTimeSheet))
}
or even declare your serializer in your model's companion object:
object Timesheet {
  implicit val timesheetWrites = Json.writes[Timesheet] // here it's the serializer (a Writes)
  ....
}
and in the controller:
import models.Timesheet.timesheetWrites

def allTimesheet = Action {
  val listofTimeSheet = Timesheet.getAll
  Ok(Json.toJson(listofTimeSheet))
}
I recommend you use play.api.libs.json.Json.toJson.
Here's an example:
object Products extends Controller {
  def list = Action {
    val productCodes = Product.findAll.map(_.ean)
    Ok(Json.toJson(productCodes))
  }
}
Json.toJson returns a JsValue, for which Play will automatically set an application/json content type.
See Play For Scala chapter 8.

Deserialize MongoDB Document with Scala and Jackson-Mapper leads to UnrecognizedProperty _id

I have the following class defined in Scala, using Jackson as the mapper.
package models
import play.api.Play.current
import org.codehaus.jackson.annotate.JsonProperty
import net.vz.mongodb.jackson.ObjectId
import play.modules.mongodb.jackson.MongoDB
import reflect.BeanProperty
import scala.collection.JavaConversions._
import net.vz.mongodb.jackson.Id
import org.codehaus.jackson.annotate.JsonIgnoreProperties
case class Team(
  @BeanProperty @JsonProperty("teamName") var teamName: String,
  @BeanProperty @JsonProperty("logo") var logo: String,
  @BeanProperty @JsonProperty("location") var location: String,
  @BeanProperty @JsonProperty("details") var details: String,
  @BeanProperty @JsonProperty("formOfSport") var formOfSport: String)

object Team {
  private lazy val db = MongoDB.collection("teams", classOf[Team], classOf[String])

  def save(team: Team) { db.save(team) }

  def getAll(): Iterable[Team] = {
    val teams: Iterable[Team] = db.find()
    return teams
  }

  def findOneByTeamName(teamName: String): Team = {
    val team: Team = db.find().is("teamName", teamName).first
    return team
  }
}
Inserting into MongoDB works without problems, and an _id is automatically inserted for every document.
But now I want to read (deserialize) a document, e.g. by calling findOneByTeamName. This always causes an UnrecognizedPropertyException for _id. I create the instance with Team.apply and Team.unapply. Even with my own ObjectId this doesn't work, as _id and id are treated differently.
Can anyone help with how to get the instance, or how to deserialize it correctly? Thanks in advance.
I am using play-mongojack. Here is my class. Your object definition is fine.
import com.fasterxml.jackson.annotation.JsonProperty
import com.fasterxml.jackson.databind.ObjectMapper
import org.mongojack.{MongoCollection, JacksonDBCollection}
import org.mongojack.ObjectId
import org.mongojack.WriteResult
import com.mongodb.BasicDBObject
import scala.reflect.BeanProperty
import javax.persistence.Id
import javax.persistence.Transient
import java.util.Date
import java.util.List
import java.lang.{ Long => JLong }
import play.mongojack.MongoDBModule
import play.mongojack.MongoDBPlugin
import scala.collection.JavaConversions._
class Event(
  @BeanProperty @JsonProperty("clientMessageId") val clientMessageId: Option[String] = None,
  @BeanProperty @JsonProperty("conversationId") val conversationId: String
) {
  @ObjectId @Id @BeanProperty var messageId: String = _ // don't manually set messageId

  @BeanProperty @JsonProperty("uploadedFile") var uploadedFile: Option[(String, String, JLong)] = None // the uploaded file (url, name, size)
  @BeanProperty @JsonProperty("createdDate") var createdDate: Date = new Date()
  @BeanProperty @Transient var cmd: Option[(String, String)] = None // the cmd (cmd, param)

  def createdDateStr() = {
    val format = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    format.format(createdDate)
  }
}
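As a hedged sketch (not part of the original answer), the same pattern could be applied to the Team class from the question, using the MongoJack imports shown above, so that MongoDB's _id maps onto a dedicated field instead of tripping the deserializer; the field name id is only illustrative, and the remaining constructor fields are omitted for brevity:
class Team(
  @BeanProperty @JsonProperty("teamName") var teamName: String,
  @BeanProperty @JsonProperty("formOfSport") var formOfSport: String
) {
  @ObjectId @Id @BeanProperty var id: String = _ // MongoJack fills this from _id when reading
}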

Sortable Columns In scala.swing.Table

Consider this example code:
import swing._
import Swing._
import javax.swing.JTable
import javax.swing.table.AbstractTableModel
class MyTable(columnNames: Seq[String], model: Seq[Seq[Any]]) extends Component {
override lazy val peer = new JTable(new AbstractTableModel {
def getValueAt(row: Int, col: Int): AnyRef = model(row)(col).asInstanceOf[AnyRef]
def getColumnCount() = columnNames.length
def getRowCount() = model.length
override def isCellEditable(row: Int, column: Int) = false
})
peer setAutoCreateRowSorter true
}
object SO extends SimpleSwingApplication {
implicit def tabelRowData2Array[T](rowData: Seq[Seq[T]]) = rowData.map(_.toArray[Any]).toArray
val rowData = Seq(Seq("1"), Seq("2"), Seq("3"))
val columnNames = Seq("Nr")
def top = new MainFrame {
title = "TableTest"
val scalaTable = new Table(rowData,columnNames) {
peer setAutoCreateRowSorter true
}
val myTable = new MyTable(columnNames,rowData)
contents = new BoxPanel(Orientation.Horizontal) {
contents += new ScrollPane(scalaTable)
contents += new ScrollPane(myTable)
}
}
}
Why are the columns in scalaTable not sortable when clicking on the column name while the columns in myTable are?
And how can I use scala.swing.Table with sortable columns instead of reimplementing it with MyTable?
See my answer to my own question at "Using TableRowSorter with scala.swing.Table". The Java 6 table-sorting feature isn't implemented in scala.swing.Table; the relevant code is commented out in its source.
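Given that limitation, a practical workaround (simply restating the question's own MyTable approach rather than a scala.swing feature) is to keep a thin wrapper around JTable and enable sorting on the peer:
// MyTable is the wrapper defined in the question above; its constructor
// already enables the row sorter on the underlying JTable peer
val sortableTable = new MyTable(columnNames, rowData)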