EdgeTriplets are not getting broadcast properly - Scala

I created a graph using GraphX and now I need to extract sub-graphs from the original graph. In the following code I am trying to broadcast the edge triplets and filter them for each user ID.
class VertexProperty(val id:Long) extends Serializable
case class User(val userId:Long, var offset:Int, val userCode:String, val Name:String, val Surname:String, val organizational_unit:String, val UME:String, val person_type:String, val SOD_HIGH:String, val SOD_MEDIUM:String, val SOD_LOW:String, val Under_mitigated:String) extends VertexProperty(userId)
case class Account(val accountId:Long, var offset:Int, val userCode:String, val userId:String, val account_creation_date:String, var disabled:String, var forcechangepwd:String, var pwdlife:String, var numberloginerror:String, var lastchangepwd:String, var lastlogin:String, var lastwronglogin:String, var state:String, var expire:String, var last_cert_time:String, var creation_date:String, var creation_user:String,var challenge_counter:String, var challenge_failed_attempt:String) extends VertexProperty(accountId) //Check if userCode is actually the code in this example.
case class Application(var applicationId:Long, var offset:Int, var Name:String, var Description:String, var Target:String, var Owner:String, var Ownercode:String, var Creation_date:String, var Creation_user:String) extends VertexProperty(applicationId)
case class Entitlement(val entitlementId:Long, var offset:Int, val Name:String, var Code:String, var Description:String, var Type:String, var Application:String, var Administrative:String, var Parent_ID:String, var Owner_code:String, var Scope_type:String, var Business_name:String, var Business_policy:String, var SOD_high:String, var SOD_medium:String, var SOD_low:String) extends VertexProperty(entitlementId)
def compute_user_triplets(uId: String, bcast_triplets: Broadcast[Array[EdgeTriplet[VertexProperty, String]]]): ArrayBuffer[EdgeTriplet[VertexProperty, String]] = {
  var user_triplets = ArrayBuffer[EdgeTriplet[VertexProperty, String]]()
  var triplets = bcast_triplets.value
  for (x <- triplets) {
    if (x.attr == uId) {
      user_triplets += x
    }
  }
  return user_triplets
}
//Some code for computing vertexRDD and edges
val edges : RDD[Edge[String]] = sc.union(user_account_edges, account_application_edges, user_entitlement_edges)
val vertexRDD: RDD[(VertexId, VertexProperty)] = vertices.map(t => (t.id, t))
val graph: Graph[VertexProperty,String] = Graph(vertexRDD, edges, new VertexProperty(-1))
val triplets = graph.triplets
val temp = triplets.map(t => t.attr)
val distinct_users = temp.distinct.filter(t => t != "NULL")
val bcast_triplets = sc.broadcast(triplets.collect())
val users_triplets = distinct_users.map(uId => compute_user_triplets(uId, bcast_triplets))
But I get the error below after the last line of the code runs. Why am I getting this error?
org.apache.spark.SparkException: Task not serializable
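For comparison, here is a minimal sketch that avoids collecting and broadcasting the triplets altogether, assuming the goal is simply to obtain the triplets grouped per user ID (it illustrates an alternative approach rather than pinpointing the non-serializable reference in the code above):
val tripletsByUser: RDD[(String, Iterable[EdgeTriplet[VertexProperty, String]])] =
  graph.triplets
    .filter(t => t.attr != "NULL")   // same filtering as distinct_users above
    .groupBy(t => t.attr)            // one group per user ID, computed on the executors
A per-user subgraph can also be built directly with graph.subgraph(epred = t => t.attr == uId) for a given uId, without materializing the triplets on the driver.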

Related

Unable to Analyse data

val patterns = ctx.getBroadcastState(patternStateDescriptor)
The imports I made
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{MapStateDescriptor, ValueState, ValueStateDescriptor}
import org.apache.flink.api.scala.typeutils.Types
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.datastream.BroadcastStream
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
import java.util.Properties // also needed, for new Properties() below
Here's the code:
val env = StreamExecutionEnvironment.getExecutionEnvironment
val properties = new Properties()
properties.setProperty("bootstrap.servers","localhost:9092")
val patternStream = new FlinkKafkaConsumer010("patterns", new SimpleStringSchema, properties)
val patterns = env.addSource(patternStream)
var patternData = patterns.map { str =>
  val splitted_str = str.split(",")
  PatternStream(splitted_str(0).trim, splitted_str(1).trim, splitted_str(2).trim)
}
val logsStream = new FlinkKafkaConsumer010("logs", new SimpleStringSchema, properties)
// logsStream.setStartFromEarliest()
val logs = env.addSource(logsStream)
var data = logs.map { str =>
  val splitted_str = str.split(",")
  LogsTest(splitted_str.head.trim, splitted_str(1).trim, splitted_str(2).trim)
}
val keyedData: KeyedStream[LogsTest, String] = data.keyBy(_.metric)
val bcStateDescriptor = new MapStateDescriptor[Unit, PatternStream]("patterns", Types.UNIT, Types.of[PatternStream]) // first type defined is for the key and second data type defined is for the value
val broadcastPatterns: BroadcastStream[PatternStream] = patternData.broadcast(bcStateDescriptor)
val alerts = keyedData
  .connect(broadcastPatterns)
  .process(new PatternEvaluator())
alerts.print()
// println(alerts.getClass)
// val sinkProducer = new FlinkKafkaProducer010("output", new SimpleStringSchema(), properties)
env.execute("Flink Broadcast State Job")
}
class PatternEvaluator()
  extends KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)] {

  private lazy val patternStateDescriptor =
    new MapStateDescriptor("patterns", classOf[String], classOf[String])

  private var lastMetricState: ValueState[String] = _

  override def open(parameters: Configuration): Unit = {
    val lastMetricDescriptor = new ValueStateDescriptor("last-metric", classOf[String])
    lastMetricState = getRuntimeContext.getState(lastMetricDescriptor)
  }
  override def processElement(reading: LogsTest,
                              readOnlyCtx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#ReadOnlyContext,
                              out: Collector[(String, String, String)]): Unit = {
    val metrics = readOnlyCtx.getBroadcastState(patternStateDescriptor)
    if (metrics.contains(reading.metric)) {
      val metricPattern: String = metrics.get(reading.metric)
      val metricPatternValue: String = metrics.get(reading.value)
      val lastMetric = lastMetricState.value()
      val logsMetric = reading.metric
      val logsValue = reading.value
      if (logsMetric == metricPattern) {
        if (metricPatternValue == logsValue) {
          out.collect((reading.timestamp, reading.value, reading.metric))
        }
      }
    }
  }
  override def processBroadcastElement(
      update: PatternStream,
      ctx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#Context,
      out: Collector[(String, String, String)]): Unit = {
    val patterns = ctx.getBroadcastState(patternStateDescriptor)
    if (update.metric == "IP") {
      patterns.put(update.metric /*, update.operator*/, update.value)
    }
    // else if (update.metric == "username") {
    //   patterns.put(update.metric, update.value)
    // }
    // else {
    //   println("No required data found")
    // }
    // }
  }
}
Sample Data: Logs Stream
"21/09/98","IP", "5.5.5.5"
Pattern Stream
"IP","==","5.5.5.5"
I'm unable to analyse the data and get the desired result, i.e. 21/09/98,IP,5.5.5.5
There is no error as of now; it's just not analysing the data.
The code is reading the streams (checked).
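The PatternStream and LogsTest case classes are not shown in the question; from the way they are used above they presumably look something like this (an assumption based on the field accesses):
case class LogsTest(timestamp: String, metric: String, value: String)
case class PatternStream(metric: String, operator: String, value: String)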
One common source of trouble in cases like this is that the API offers no control over the order in which the patterns and the data are ingested. It could be that processElement is being called before processBroadcastElement.
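If that is what is happening here, one common workaround (a rough sketch, not tested against the code above; the state name and the simplified matching logic are assumptions) is to buffer readings in keyed state until a pattern for their metric has been broadcast:
// Sketch only: additional keyed state inside PatternEvaluator to park early readings
// (requires importing org.apache.flink.api.common.state.{ListState, ListStateDescriptor})
private lazy val pendingReadings: ListState[LogsTest] =
  getRuntimeContext.getListState(new ListStateDescriptor[LogsTest]("pending-readings", classOf[LogsTest]))

override def processElement(reading: LogsTest,
                            readOnlyCtx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#ReadOnlyContext,
                            out: Collector[(String, String, String)]): Unit = {
  val patterns = readOnlyCtx.getBroadcastState(patternStateDescriptor)
  if (!patterns.contains(reading.metric)) {
    pendingReadings.add(reading) // no pattern broadcast yet for this metric; keep the reading for later
  } else if (patterns.get(reading.metric) == reading.value) {
    out.collect((reading.timestamp, reading.value, reading.metric))
  }
}
The buffered readings then need to be re-evaluated later, for example from processBroadcastElement via ctx.applyToKeyedState, or lazily the next time an element for the same key arrives.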

Scala: How to sort a List of objects by an object field (Object.field)?

I have the following type of data:
case class TipoDeDato[T] (nombreCampo: String,valor: T)
In my exercise I need to create the following structure using the data type I mentioned.
So I created this structure:
val registro0: List[TipoDeDato[_>: String with Int]] = List(
new TipoDeDato[String]("Autor", "Gabo"),
new TipoDeDato[String]("Titulo", "100 Años"),
new TipoDeDato[Int]("Numero de Paginas", 700)
)
val registro1: List[TipoDeDato[_>: String with Int]] = List(
new TipoDeDato[String]("Autor", "Gabo"),
new TipoDeDato[String]("Titulo", "Maria"),
new TipoDeDato[Int]("Numero de Paginas", 1200)
)
val registro2: List[TipoDeDato[_>: String with Int]] = List(
new TipoDeDato[String]("Autor", "Gabo"),
new TipoDeDato[String]("Titulo", "Carrasco"),
new TipoDeDato[Int]("Numero de Paginas", 150)
)
val registro3: List[TipoDeDato[_>: String with Int]] = List(
new TipoDeDato[String]("Autor", "Gabo"),
new TipoDeDato[String]("Titulo", "Oceano"),
new TipoDeDato[Int]("Numero de Paginas", 200)
)
And to create the "Libros" object, I have done the following:
val Libros: List[List[TipoDeDato[_>: String with Int]]] = List(registro0,registro1,registro2,registro3)
My question is: how can I sort the "Libros" object by any of its components ("Autor", "Titulo", "Numero de Paginas")? And is this structure adequate for what I need to do?
To sort a List of Lists:
sealed trait TipoDeDato
case class Autor (autor: String) extends TipoDeDato
case class Titulo (titulo: String) extends TipoDeDato
case class NumeroDePaginas (numeroDePaginas: Int) extends TipoDeDato
class TablaItems(var registros: List[List[TipoDeDato]]) {
  def insertInto(reg: List[List[TipoDeDato]]): TablaItems = {
    registros = registros ::: reg
    this
  }
}
val registro0: List[TipoDeDato] = List(
Autor("HGabo"),
Titulo("ZLa María"),
NumeroDePaginas(752)
)
val registro1: List[TipoDeDato] = List(
Autor("AGabo"),
Titulo("CLa María"),
NumeroDePaginas(521)
)
val Registros1: List[List[TipoDeDato]] = List(registro0)
val Registros2: List[List[TipoDeDato]] = List(registro1)
val tablaLibros = new TablaItems(Registros1)
tablaLibros.registros.foreach(println)
println("----")
tablaLibros.insertInto(Registros2)
tablaLibros.registros.foreach(println)
println("----")
tablaLibros.registros.sortBy(r => r.collectFirst {
  case NumeroDePaginas(n) => n
}.getOrElse(0))
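The same collectFirst approach works for the other fields as well; for example, sorting by title (a small sketch, using an empty string as the fallback when a row has no Titulo):
tablaLibros.registros.sortBy(r => r.collectFirst { case Titulo(t) => t }.getOrElse(""))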
Actually I think you need:
case class Dato(autor: String, titulo: String, numeroDePaginas: Int)
class TablaItems(var registros: List[Dato]) {
  def insertInto(reg: List[Dato]): TablaItems = {
    registros = registros ::: reg
    this
  }
}
// you can also use named arguments if you prefer: Dato(autor = "HGabo", titulo = "ZLa María", numeroDePaginas = 752)
val registro0 = Dato("HGabo", "ZLa María", 752)
val registro1 = Dato("AGabo", "CLa María", 521)
val Registros1: List[Dato] = List(registro0)
val Registros2: List[Dato] = List(registro1)
val tablaLibros = new TablaItems(Registros1)
tablaLibros.registros.foreach(println)
println("----")
tablaLibros.insertInto(Registros2)
tablaLibros.registros.foreach(println)
println("----")
tablaLibros.registros.sortBy(_.numeroDePaginas)
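Note that sortBy returns a new sorted list rather than reordering registros in place, so keep the result if you need it later, e.g.:
val porPaginas: List[Dato] = tablaLibros.registros.sortBy(_.numeroDePaginas)
porPaginas.foreach(println)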
Also, if this problem calls for functional programming (no side effects; I also dropped the OOP style, though that is not mandatory, since OOP and FP are orthogonal):
case class TablaItems(registros: List[Dato])
implicit class TablaItemsOperations(tabla: TablaItems) {
  def withData(reg: List[Dato]) = TablaItems(tabla.registros ::: reg) // ::: concatenates the two lists
}
...
val tablaLibros = TablaItems(Registros1)
tablaLibros.registros.foreach(println)
println("----")
val tablaLibrosUpdated = tablaLibros.withData(Registros2)
tablaLibrosUpdated.registros.foreach(println)
println("----")
tablaLibrosUpdated.registros.sortBy(_.numeroDePaginas)
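Since Dato is a plain case class, sorting by several components at once is also straightforward, for example (a sketch):
val porAutorYPaginas = tablaLibrosUpdated.registros.sortBy(d => (d.autor, d.numeroDePaginas))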

Spark groupBy operation hangs at 199/200

I have a Spark standalone cluster with a master and two executors. I have an RDD[LevelOneOutput]; below is the LevelOneOutput class:
class LevelOneOutput extends Serializable {
  @BeanProperty
  var userId: String = _
  @BeanProperty
  var tenantId: String = _
  @BeanProperty
  var rowCreatedMonth: Int = _
  @BeanProperty
  var rowCreatedYear: Int = _
  @BeanProperty
  var listType1: ArrayBuffer[TypeOne] = _
  @BeanProperty
  var listType2: ArrayBuffer[TypeTwo] = _
  @BeanProperty
  var listType3: ArrayBuffer[TypeThree] = _
  ...
  ...
  @BeanProperty
  var listType18: ArrayBuffer[TypeEighteen] = _
  @BeanProperty
  var groupbyKey: String = _
}
Now I want to group this RDD based on userId, tenantId, rowCreatedMonth, and rowCreatedYear. For that I did this:
val levelOneRDD = inputRDD.map(row => {
  row.setGroupbyKey(s"${row.getTenantId}_${row.getRowCreatedYear}_${row.getRowCreatedMonth}_${row.getUserId}")
  row
})
val groupedRDD = levelOneRDD.groupBy(row => row.getGroupbyKey)
This gives me the data with the key as a String and the value as an Iterable[LevelOneOutput].
Now I want to generate a single LevelOneOutput object for each group key. For that I was doing something like the following:
val rdd = groupedRDD.map(row => {
  val levelOneOutput = new LevelOneOutput
  val groupKey = row._1.split("_")
  levelOneOutput.setTenantId(groupKey(0))
  levelOneOutput.setRowCreatedYear(groupKey(1).toInt)
  levelOneOutput.setRowCreatedMonth(groupKey(2).toInt)
  levelOneOutput.setUserId(groupKey(3))
  var listType1 = new ArrayBuffer[TypeOne]
  var listType2 = new ArrayBuffer[TypeTwo]
  var listType3 = new ArrayBuffer[TypeThree]
  ...
  ...
  var listType18 = new ArrayBuffer[TypeEighteen]
  row._2.foreach(data => {
    if (data.getListType1 != null) listType1 = listType1 ++ data.getListType1
    if (data.getListType2 != null) listType2 = listType2 ++ data.getListType2
    if (data.getListType3 != null) listType3 = listType3 ++ data.getListType3
    ...
    ...
    if (data.getListType18 != null) listType18 = listType18 ++ data.getListType18
  })
  if (listType1.isEmpty) levelOneOutput.setListType1(null) else levelOneOutput.setListType1(listType1)
  if (listType2.isEmpty) levelOneOutput.setListType2(null) else levelOneOutput.setListType2(listType2)
  if (listType3.isEmpty) levelOneOutput.setListType3(null) else levelOneOutput.setListType3(listType3)
  ...
  ...
  if (listType18.isEmpty) levelOneOutput.setListType18(null) else levelOneOutput.setListType18(listType18)
  levelOneOutput
})
This works as expected for small input sizes, but when I try to run it on a larger input data set, the groupBy operation hangs at 199/200 and I don't see any specific error or warning in stdout/stderr.
Can someone point out why the job is not proceeding further?
Instead of using the groupBy operation I created a paired RDD like below
val levelOnePairedRDD = inputRDD.map(row => {
  row.setGroupbyKey(s"${row.getTenantId}_${row.getRowCreatedYear}_${row.getRowCreatedMonth}_${row.getUserId}")
  (row.getGroupbyKey, row) // getGroupbyKey, matching the @BeanProperty getter generated for groupbyKey
})
and updated the processing logic, which solved my issue.
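For reference, a rough sketch of what the updated logic could look like with reduceByKey, which combines values map-side instead of pulling every row for a key into a single task; merge is a hypothetical helper that concatenates the listType1..listType18 buffers of two LevelOneOutput objects, along the lines of the foreach above:
val reduced: RDD[(String, LevelOneOutput)] =
  levelOnePairedRDD.reduceByKey((a, b) => merge(a, b))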

Generate keywords using Apache Spark and mllib

I wrote code like this:
val hashingTF = new HashingTF()
val tfv: RDD[Vector] = sparkContext.parallelize(articlesList.map { t => hashingTF.transform(t.words) })
tfv.cache()
val idf = new IDF().fit(tfv)
val rate: RDD[Vector] = idf.transform(tfv)
How can I get the top 5 keywords from the "rate" RDD for each articlesList item?
ADD:
articlesList contains objects:
case class ArticleInfo (val url: String, val author: String, val date: String, val keyWords: List[String], val words: List[String])
words contains all the words from the article.
I do not understand the structure of rate; the documentation says:
@return an RDD of TF-IDF vectors
My solution is:
(articlesList, rate.collect()).zipped.foreach { (art, tfidf) =>
  val keywords = new mutable.TreeSet[(String, Double)]
  art.words.foreach { word =>
    val wordHash = hashingTF.indexOf(word)
    val wordTFIDF = tfidf.apply(wordHash)
    if (keywords.size == KEYWORD_COUNT) {
      val minimum = keywords.minBy(_._2)
      if (minimum._2 < wordTFIDF) { // compare TF-IDF weights, not the hash index
        keywords.remove(minimum)
        keywords.add((word, wordTFIDF))
      }
    } else {
      keywords.add((word, wordTFIDF))
    }
  }
  art.keyWords = keywords.toList.map(_._1) // note: keyWords must be declared as a var in ArticleInfo for this assignment to compile
}

Is it possible to serialize non case classes in Scala?

Is it possible to serialize an object of the class below using Json4s, Lift, or any other library?
class User(uId: Int) extends Serializable {
  var id: Int = uId
  var active: Boolean = false
  var numTweets: Int = 0
  var followers: ArrayBuffer[Int] = null
  var following: ArrayBuffer[Int] = null
  var userTimeline: Queue[String] = null
  var homeTimeline: Queue[String] = null
  var userTimelineSize: Int = 0
  var homeTimelineSize: Int = 0
  //var notifications: Queue[String] = null
  var mentions: Queue[String] = null
  var directMessages: Queue[String] = null
}
You can use Json4s for this purpose (with the help of FieldSerializer); below is the code to get started with serializing the User object:
def main(args: Array[String]) {
  import org.json4s._
  import org.json4s.native.Serialization
  import org.json4s.native.Serialization.{read, write, writePretty}

  implicit val formats = DefaultFormats + FieldSerializer[User]()

  val user = new User(12)
  val json = write(user)
  println(writePretty(user))
}
Also, in your non-case class, any field that may be missing from the JSON needs to be an Option.
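For example, a field that might be absent from the incoming JSON would be declared like this (nickname is a hypothetical field, not part of the class above):
var nickname: Option[String] = None // hypothetical field: absent in the JSON => None instead of a failure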
Another method would be to go for Genson:
def main(args: Array[String]) {
  import com.owlike.genson._
  import com.owlike.genson.ext.json4s._
  import org.json4s._
  import org.json4s.JsonDSL._
  import org.json4s.JsonAST._

  object CustomGenson {
    val genson = new ScalaGenson(
      new GensonBuilder()
        .withBundle(ScalaBundle(), Json4SBundle())
        .create()
    )
  }

  // then just import it in the places you want to use this instance instead of the default one
  import CustomGenson.genson._

  val user = new User(12)
  val jsonArray = toJson(user)
  println(jsonArray)
}
}