Sorting a DStream and taking the top N - Scala

I have a DStream in Spark Scala and I want to sort it and then take the top N.
The problem is that whenever I try to run it I get a NotSerializableException, and the exception message says:
This is because the DStream object is being referred to from within the closure.
I don't know how to solve it. Here is my attempt:
package com.badrit.realtime
import java.util.Date
import com.badrit.drivers.UnlimitedSpaceTimeDriver
import com.badrit.model.{CellBuilder, DataReader, Trip}
import com.badrit.utility.Printer
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Duration, Milliseconds, StreamingContext}
import scala.collection.mutable
object StreamingDriver {
val appName: String = "HotSpotRealTime"
val hostName = "localhost"
val port = 5050
val constrains = UnlimitedSpaceTimeDriver.constrains;
var streamingRate = 1;
var windowSize = 8;
var slidingInterval = 2;
val cellBuilder = new CellBuilder(constrains)
val inputFilePath = "/home/ahmedelgamal/Downloads/green_tripdata_2015-02.csv"
def prepareTestData(sparkStreamCtx: StreamingContext): InputDStream[Trip] = {
val sparkCtx = sparkStreamCtx.sparkContext
val textFile: RDD[String] = sparkCtx.textFile(inputFilePath)
val data: RDD[Trip] = new DataReader().getTrips(textFile)
val groupedData = data.filter(_.pickup.date.before(new Date(2015, 1, 2, 0, 0, 0)))
.groupBy(trip => trip.pickup.date.getMinutes).sortBy(_._1).map(_._2).collect()
printf("Grouped Data Count is " + groupedData.length)
var dataQueue: mutable.Queue[RDD[Trip]] = mutable.Queue.empty;
groupedData.foreach(trips => dataQueue += sparkCtx.makeRDD(trips.toArray))
printf("\n\nTest Queue size is " + dataQueue.size)
groupedData.zipWithIndex.foreach { case (trips: Iterable[Trip], index: Int) => {
println("Items List " + index)
val passengers: Array[Int] = trips.map(_.passengers).toArray
val cnt = passengers.length
println("Sum is " + passengers.sum)
println("Cnt is " + cnt)
val passengersRdd = sparkCtx.parallelize(passengers)
println("Mean " + passengersRdd.mean())
println("Stdv" + passengersRdd.stdev())
}
}
sparkStreamCtx.queueStream(dataQueue, true)
}
def cellCreator(trip: Trip) = cellBuilder.cellForCarStop(trip.pickup)
def main(args: Array[String]) {
if (args.length < 1) {
streamingRate = 1;
windowSize = 3 //2 hours 60 * 60 * 1000L
slidingInterval = 2 //0.5 hour 60 * 60 * 1000L
}
else {
streamingRate = args(0).toInt;
windowSize = args(1).toInt
slidingInterval = args(2).toInt
}
val sparkConf = new SparkConf().setAppName(appName).setMaster("local[*]")
val sparkStreamCtx = new StreamingContext(sparkConf, Milliseconds(streamingRate))
sparkStreamCtx.sparkContext.setLogLevel("ERROR")
sparkStreamCtx.checkpoint("/tmp")
val data: InputDStream[Trip] = prepareTestData(sparkStreamCtx)
val dataWindow = data.window(new Duration(windowSize), new Duration(slidingInterval))
//my main problem lies in the following line
val newDataWindow = dataWindow.transform(rdd => sparkStreamCtx.sparkContext.parallelize(rdd.take(10)))
newDataWindow.print
sparkStreamCtx.start()
sparkStreamCtx.awaitTerminationOrTimeout(1000)
}
}
I am open to any other way of sorting a DStream and getting its top N instead of my approach.

You can use the transform method on the DStream, sort the RDD inside it and take its first n elements into a list, then filter the original RDD to keep only the elements contained in that list.
val n = 10
val topN = result.transform(rdd =>{
val list = rdd.sortBy(_._1).take(n)
rdd.filter(list.contains)
})
topN.print
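If you only need the top N elements themselves (rather than filtering the whole window), a variant of the same transform can skip the full sort. This is only a sketch: like the answer above it assumes the stream holds key-value pairs with an orderable key, and it reuses result and n from that answer.
val n = 10
val topN = result.transform { rdd =>
  // takeOrdered brings back only the n smallest keys instead of sorting the whole RDD;
  // use Ordering.by(...).reverse if you want the largest keys instead.
  val top = rdd.takeOrdered(n)(Ordering.by(_._1))
  // Rebuild a small RDD from the driver-side array so downstream DStream operations still work.
  rdd.sparkContext.parallelize(top)
}
topN.print()
Building the small RDD with rdd.sparkContext inside transform also avoids capturing the outer StreamingContext in the closure, which is what caused the NotSerializableException on the line the question flags as its main problem.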

Related

How to push a DataStream to a Kafka topic while retaining the order using the string method in Flink (Kafka problem)

I am trying to create a JSON dataset every 500 ms and want to push it to a Kafka topic so that I can set up some windows downstream and perform computations. Below is my code:
package KafkaAsSource
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.datastream.DataStream
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.Semantic
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaProducer}
import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper
import java.time.format.DateTimeFormatter
import java.time.LocalDateTime
import java.util.{Optional, Properties}
object PushingDataToKafka {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setMaxParallelism(256)
env.enableCheckpointing(5000)
val stream: DataStream[String] = env.fromElements(createData())
stream.addSink(sendToTopic(stream))
}
def getProperties(): Properties = {
val properties = new Properties()
properties.setProperty("bootstrap.servers", "localhost:9092")
properties.setProperty("zookeeper.connect", "localhost:2181")
return properties
}
def createData(): String = {
val minRange: Int = 0
val maxRange: Int = 1000
var jsonData = ""
for (a <- minRange to maxRange) {
jsonData = "{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\"" + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\"\n \n}"
println(jsonData)
Thread.sleep(500)
}
return jsonData
}
def sendToTopic(): Properties = {
val producer = new FlinkKafkaProducer[String](
"topic"
,
new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema())
,
getProperties(),
FlinkKafkaProducer.Semantic.EXACTLY_ONCE
)
return producer
}
}
It gives me the following error:
type mismatch;
found : Any
required: org.apache.flink.streaming.api.functions.sink.SinkFunction[String]
stream.addSink(sendToTopic())
Modified Code:
object FlinkTest {
def main(ars: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment()
env.setMaxParallelism(256)
var stream = env.fromElements("")
//env.enableCheckpointing(5000)
//val stream: DataStream[String] = env.fromElements("hey mc", "1")
val myProducer = new FlinkKafkaProducer[String](
"maddy", // target topic
new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()), // serialization schema
getProperties(), // producer config
FlinkKafkaProducer.Semantic.EXACTLY_ONCE)
val minRange: Int = 0
val maxRange: Int = 10
var jsonData = ""
for (a <- minRange to maxRange) {
jsonData = "{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\"" + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\"\n \n}"
println(a)
Thread.sleep(500)
stream = env.fromElements(jsonData)
println(jsonData)
stream.addSink(myProducer)
}
env.execute("hey")
}
def getProperties(): Properties = {
val properties = new Properties()
properties.setProperty("bootstrap.servers", "localhost:9092")
properties.setProperty("zookeeper.connect", "localhost:2181")
return properties
}
/*
def createData(): String = {
val minRange: Int = 0
val maxRange: Int = 10
var jsonData = ""
for (a <- minRange to maxRange) {
jsonData = "{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\"" + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\"\n \n}"
Thread.sleep(500)
}
return jsonData
}
*/
}
The modified code gets the data into the Kafka topic, but it doesn't retain the order. What am I doing wrong in the loop? Also, I had to downgrade Flink from 1.13.5 to 1.12.2.
I was initially using Flink 1.13.5 with the Scala 2.11 connectors. What exactly am I missing here?
A couple of things about this loop:
for (a <- minRange to maxRange) {
jsonData =
"{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\""
+ DateTimeFormatter
.ofPattern("yyyy-MM-dd HH:mm:ss.SSS")
.format(LocalDateTime.now) + "\"\n \n}"
println(a)
Thread.sleep(500)
stream = env.fromElements(jsonData)
println(jsonData)
stream.addSink(myProducer)
}
The sleep is happening in the Flink client, and only affects how long it takes the client to assemble the job graph before submitting it to the cluster. It has no effect on how the job runs.
This loop is creating 10 separate pipelines that will run independently, in parallel, all producing to the same Kafka topic. Those pipelines are going to race against each other.
To get the behavior you're looking for (a global ordering across a single pipeline) you'll want to produce all of the events from a single source (in order, of course), and run the job with a parallelism of one. Something like this would do it:
import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _}
object FlinkTest {
def main(ars: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment()
env.setParallelism(1)
val myProducer = ...
val jsonData = (i: Long) => ...
env.fromSequence(0, 9)
.map(i => jsonData(i))
.addSink(myProducer)
env.execute()
}
}
You can leave maxParallelism at 256 (or at its default value of 128); it's not particularly relevant here. The maxParallelism is the number of hash buckets that keyBy will hash the keys into, and it defines an upper limit on the scalability of the job.
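For completeness, here is one way the two elided values in the sketch above might be filled in. This is an assumption, not part of the original answer: it simply reuses the topic name, properties, JSON format and checkpointing interval from the question's own code.
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer
import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import java.util.Properties

object FlinkTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)         // single task, so events stay in source order
    env.enableCheckpointing(5000) // EXACTLY_ONCE needs checkpointing to commit Kafka transactions

    val myProducer = new FlinkKafkaProducer[String](
      "maddy", // target topic from the question
      new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()),
      getProperties(),
      FlinkKafkaProducer.Semantic.EXACTLY_ONCE)

    // Build the JSON payload for a given sequence number, same format as the question.
    val jsonData = (i: Long) =>
      "{ \"id\":\"" + i + "\", \"Category\":\"Flink\", \"eventTime\":\"" +
        DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\" }"

    env.fromSequence(0, 9)
      .map(i => jsonData(i))
      .addSink(myProducer)

    env.execute("ordered-json-to-kafka")
  }

  def getProperties(): Properties = {
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "localhost:9092")
    // keep below the broker's transaction.max.timeout.ms when using EXACTLY_ONCE
    properties.setProperty("transaction.timeout.ms", "60000")
    properties
  }
}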

How to write RDD[List[(ImmutableBytesWritable, Put)]] to HBase using saveAsNewAPIHadoopDataset

I know that we can use saveAsNewAPIHadoopDataset with an RDD[(ImmutableBytesWritable, Put)] to write to an HBase table using Spark.
But I have a list, i.e. an RDD[List[(ImmutableBytesWritable, Put)]], which I want to write to 2 different HBase tables.
How do I do it?
Below is the code.
package com.scryAnalytics.FeatureExtractionController
import com.scryAnalytics.FeatureExtractionController.DAO.{DocumentEntitiesDAO, NLPEntitiesDAO, SegmentFeaturesDAO}
import com.scryAnalytics.NLPGeneric.{GateGenericNLP, NLPEntities}
import com.sun.xml.bind.v2.TODO
import com.vocp.ner.main.GateNERImpl
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.mapreduce.Job
import com.scryAnalytics.FeatureExtraction.SegmentsFeatureExtraction
import com.scryAnalytics.FeatureExtraction.DAO.VOCPEntities
import scala.collection.JavaConversions._
import gate.FeatureMap
import java.util.Map.Entry
import scala.collection.JavaConversions
import scala.util.control.Breaks.break
import scala.util.control.ControlThrowable
/**
* Created by sahil on 1/12/16.
*/
object Main {
def main(args: Array[String]): Unit = {
val inputTableName = "posts"
val outputTableName = "drugSegmentNew1"
val pluginHome = "/home/sahil/Voice-of-Cancer-Patients/VOCP Modules/bin/plugins"
val sc = new SparkContext(new SparkConf().setAppName("HBaseRead").setMaster("local[4]"))
val conf = HBaseConfiguration.create()
conf.set(HConstants.ZOOKEEPER_QUORUM, "localhost")
conf.set(TableInputFormat.INPUT_TABLE, inputTableName)
val admin = new HBaseAdmin(conf)
if (!admin.isTableAvailable(inputTableName)) {
val tableDesc = new HTableDescriptor(TableName.valueOf(inputTableName))
admin.createTable(tableDesc)
}
val job: Job = Job.getInstance(conf, "FeatureExtractionJob")
job.setOutputFormatClass(classOf[MultiTableOutputFormat])
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[ImmutableBytesWritable], classOf[Result])
val resultRDD = hBaseRDD.map(x => x._2)
// TODO: Add filters
val entity: VOCPEntities = VOCPEntities.DRUG
val nlpRDD = resultRDD.mapPartitions { iter =>
val nlpEntities: NLPEntitiesDAO = new NLPEntitiesDAO
iter.map {
result =>
val message = Bytes.toString(result.getValue(Bytes.toBytes("p"), Bytes.toBytes("message")))
val row_key = Bytes.toString(result.getRow)
nlpEntities.setToken(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("token")))))
nlpEntities.setSpaceToken(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("spaceToken")))))
nlpEntities.setSentence(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("sentence")))))
nlpEntities.setVG(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("verbGroup")))))
nlpEntities.setSplit(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("split")))))
nlpEntities.setNounChunk(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("nounChunk")))))
nlpEntities.setDrugs(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("drug")))))
nlpEntities.setRegimen(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("regimen")))))
nlpEntities.setSideEffects(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("sideEffect")))))
nlpEntities.setALT_DRUG(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("altDrug")))))
nlpEntities.setALT_THERAPY(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("altTherapy")))))
(row_key, message, nlpEntities)
}
}
val featureExtractionOld: SegmentsFeatureExtraction = new SegmentsFeatureExtraction(
pluginHome, entity)
val outputRDD = nlpRDD.mapPartitions { iter =>
val featureExtraction: SegmentsFeatureExtraction = new SegmentsFeatureExtraction(
pluginHome, entity)
iter.map { x =>
val featuresJson = featureExtraction.generateFeatures(x._2, Utility.objectToJson(x._3))
val segmentFeatures: SegmentFeaturesDAO = Utility.jsonToSegmentFeatures(featuresJson)
val documentEntities: DocumentEntitiesDAO = new DocumentEntitiesDAO
documentEntities.setSystemId(x._1)
documentEntities.setToken(x._3.getToken)
documentEntities.setSpaceToken(x._3.getSpaceToken)
documentEntities.setSentence(x._3.getSentence)
documentEntities.setVG(x._3.getVG)
documentEntities.setNounChunk(x._3.getNounChunk)
documentEntities.setSplit(x._3.getSplit)
documentEntities.setDRUG(x._3.getDrugs)
documentEntities.setSE(x._3.getSideEffects)
documentEntities.setREG(x._3.getRegimen)
documentEntities.setALT_DRUG(x._3.getALT_DRUG)
documentEntities.setALT_THERAPY(x._3.getALT_THERAPY)
documentEntities.setSegment(segmentFeatures.getSegment)
documentEntities.setSegmentClass(segmentFeatures.getSegmentClass)
documentEntities.setSegmentInstance(segmentFeatures.getSegmentInstance)
(x._1, documentEntities)
}
}
val newRDD = outputRDD.map { k => convertToPut(k) }
newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
}
def convertToPut(NlpWithRowKey: (String, DocumentEntitiesDAO)): List[(ImmutableBytesWritable, Put)] = {
val rowkey = NlpWithRowKey._1
val documentEntities = NlpWithRowKey._2
var returnList: List[(ImmutableBytesWritable, Put)] = List()
val segmentInstances = documentEntities.getSegmentInstance
val segments = documentEntities.getSegment
if(segments != null) {
var count = 0
for(segment <- segmentInstances) {
val keyString: String = documentEntities.getSystemId + "#" + Integer.toString(count)
count = count + 1
val outputKey: ImmutableBytesWritable = new ImmutableBytesWritable(keyString.getBytes())
val put = new Put(outputKey.get())
val features: FeatureMap = segment.getFeatures
val it: Iterator[Entry[Object, Object]] = features.entrySet.iterator()
var sideEffect_offset = "NULL"
var entity_offset = "NULL"
while(it.hasNext) {
val pair = it.next()
if(pair.getKey.equals("sideEffect-offset")) {
sideEffect_offset = pair.getValue().toString()
}
else if(pair.getKey.equals("drug-offset")) {
entity_offset = pair.getValue().toString()
}
else if(pair.getKey().equals("drug") || pair.getKey().equals("sideEffect")){
put.add(Bytes.toBytes("seg"), Bytes.toBytes(pair.getKey.toString), Bytes
.toBytes(pair.getValue().toString))
}
else {
put.add(Bytes.toBytes("segFeatures"), Bytes.toBytes(pair.getKey.toString), Bytes
.toBytes(pair.getValue().toString))
}
}
put.add(Bytes.toBytes("seg"), Bytes.toBytes("RelationId"),
Bytes.toBytes(documentEntities.getSystemId() + "-" + entity_offset + "-" + sideEffect_offset))
put.add(Bytes.toBytes("segInst"),Bytes.toBytes("id"), Bytes.toBytes(segment.getId()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("type"), Bytes.toBytes(segment.getType()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("startNodeId"), Bytes.toBytes(
segment.getStartNode().getId()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("startNodeOffset"),
Bytes.toBytes(segment.getStartNode().getOffset()))
put.add(Bytes.toBytes("segInst"),Bytes.toBytes("endNodeId"),
Bytes.toBytes(segment.getEndNode().getId()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("endNodeOffset"),
Bytes.toBytes(segment.getEndNode().getOffset()))
put.add(Bytes.toBytes("seg"),Bytes.toBytes("system_id"),
Bytes.toBytes(documentEntities.getSystemId()))
put.add(Bytes.toBytes("seg"), Bytes.toBytes("segmentText"),
Bytes.toBytes(segment.getAnnotatedText()))
for(segmentClassAnnots <- documentEntities.getSegmentClass) {
try {
if (segment.getId().equals(segmentClassAnnots.getFeatures().get("instance-id"))) {
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("id"),
Bytes.toBytes(segmentClassAnnots.getId()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("type"),
Bytes.toBytes(segmentClassAnnots.getType()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("startNodeId"), Bytes
.toBytes(segmentClassAnnots.getStartNode()
.getId()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("startNodeOffset"), Bytes
.toBytes(segmentClassAnnots.getStartNode()
.getOffset()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("endNodeId"), Bytes
.toBytes(segmentClassAnnots.getEndNode()
.getId()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("endNodeOffset"), Bytes
.toBytes(segmentClassAnnots.getEndNode()
.getOffset()))
break
}
} catch {
case t: Throwable => t.printStackTrace
}
returnList = returnList:+((new ImmutableBytesWritable(Bytes.toBytes("drugSegmentNew1")), put))
}
}
}
val PUT = new Put(Bytes.toBytes(rowkey))
PUT.add(Bytes.toBytes("f"), Bytes.toBytes("dStatus"), Bytes.toBytes("1"))
returnList = returnList:+((new ImmutableBytesWritable(Bytes.toBytes("posts")), PUT))
(returnList)
}
}
Just replace this line:
val newRDD = outputRDD.map { k => convertToPut(k) }
with this one:
val newRDD = outputRDD.flatMap { k => convertToPut(k) }
flatMap flattens each List[(ImmutableBytesWritable, Put)] into its individual pairs, so newRDD becomes an RDD[(ImmutableBytesWritable, Put)], which is exactly what saveAsNewAPIHadoopDataset expects. Hope this helps!
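Since the job already sets MultiTableOutputFormat, the ImmutableBytesWritable key of each flattened pair names the destination table, so a single saveAsNewAPIHadoopDataset call can feed both tables. Below is a minimal standalone sketch of that pattern, not the question's code: the row keys, column families and values are made up for illustration, while the two table names are the ones used in convertToPut.
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

object MultiTableWrite {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MultiTableWrite").setMaster("local[2]"))

    // Picks up hbase-site.xml; otherwise set HConstants.ZOOKEEPER_QUORUM as the question does.
    val conf = HBaseConfiguration.create()
    val job = Job.getInstance(conf, "MultiTableWrite")
    job.setOutputFormatClass(classOf[MultiTableOutputFormat])

    // With MultiTableOutputFormat the key of each (key, Put) pair is the target table name,
    // so one flattened RDD can write to several tables in a single action.
    val puts = sc.parallelize(Seq("row1", "row2")).flatMap { rowKey =>
      val segmentPut = new Put(Bytes.toBytes(rowKey + "#0"))
      segmentPut.add(Bytes.toBytes("seg"), Bytes.toBytes("system_id"), Bytes.toBytes(rowKey))
      val statusPut = new Put(Bytes.toBytes(rowKey))
      statusPut.add(Bytes.toBytes("f"), Bytes.toBytes("dStatus"), Bytes.toBytes("1"))
      List(
        (new ImmutableBytesWritable(Bytes.toBytes("drugSegmentNew1")), segmentPut),
        (new ImmutableBytesWritable(Bytes.toBytes("posts")), statusPut))
    }

    puts.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}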

Implementing topological sort in Spark GraphX

I am trying to implement topological sort using Spark's GraphX library.
This is the code I've written so far:
MyObject.scala
import java.util.ArrayList
import scala.collection.mutable.Queue
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.EdgeDirection
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.Graph.graphToGraphOps
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
object MyObject {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Spark-App").setMaster("local[2]")
val sc = new SparkContext(conf)
val resources: RDD[Resource] = makeResources(sc)
val relations: RDD[Relation] = makeRelations(sc)
println("Building graph ...")
var graph = buildGraph(resources, relations, sc)
println("Graph built!!")
println("Testing topo sort ...")
val topoSortResult = topoSort(graph, sc);
println("topoSortResult = " + topoSortResult)
println("Testing topo sort done!")
}
def buildGraph(resources: RDD[Resource], relations: RDD[Relation], sc: SparkContext): Graph[Resource, Relation] =
{
val vertices: RDD[(Long, Resource)] = resources.map(resource => (resource.id, resource))
val edges: RDD[Edge[Relation]] = relations.map(relation => Edge(relation.srcId, relation.dstId, relation))
var graph = Graph[Resource, Relation](vertices, edges)
graph
}
def makeResources(sc: SparkContext): RDD[Resource] =
{
var list: List[Resource] = List()
list = list :+ new Resource(1L)
list = list :+ new Resource(2L)
list = list :+ new Resource(3L)
list = list :+ new Resource(4L)
list = list :+ new Resource(5L)
sc.parallelize(list)
}
def makeRelations(sc: SparkContext): RDD[Relation] =
{
var list: List[Relation] = List()
list = list :+ new Relation(1L, "depends_on", 2L)
list = list :+ new Relation(3L, "depends_on", 2L)
list = list :+ new Relation(4L, "depends_on", 2L)
list = list :+ new Relation(5L, "depends_on", 2L)
sc.parallelize(list)
}
def topoSort(graph: Graph[Resource, Relation], sc: SparkContext): java.util.List[(VertexId, Resource)] =
{
// Will contain the result
val sortedResources: java.util.List[(VertexId, Resource)] = new ArrayList()
// Contains all the vertices
val vertices = graph.vertices
// Contains all the vertices whose in-degree > 0
val inDegrees = graph.inDegrees;
val inDegreesKeys_array = inDegrees.keys.collect();
// Contains all the vertices whose in-degree == 0
val inDegreeZeroList = vertices.filter(vertex => !inDegreesKeys_array.contains(vertex._1))
// A map of vertexID vs its in-degree
val inDegreeMapRDD = inDegreeZeroList.map(vertex => (vertex._1, 0)).union(inDegrees);
// Insert all the resources whose in-degree == 0 into a queue
val queue = new Queue[(VertexId, Resource)]
for (vertex <- inDegreeZeroList.toLocalIterator) { queue.enqueue(vertex) }
// Get an RDD containing the outgoing edges of every vertex
val neighbours = graph.collectNeighbors(EdgeDirection.Out)
// Initiate the algorithm
while (!queue.isEmpty) {
val vertex_top = queue.dequeue()
// Add the topmost element of the queue to the result
sortedResources.add(vertex_top)
// Get the neigbours (from outgoing edges) of this vertex
// This will be an RDD containing just 1 element which will be an array of neighbour vertices
val vertex_neighbours = neighbours.filter(vertex => vertex._1.equals(vertex_top._1))
// For each vertex, decrease its in-degree by 1
vertex_neighbours.foreach(arr => {
val neighbour_array = arr._2
neighbour_array.foreach(vertex => {
val oldInDegree = inDegreeMapRDD.filter(vertex_iter => (vertex_iter._1 == vertex._1)).first()._2
val newInDegree = oldInDegree - 1
// Reflect the new in-degree in the in-degree map RDD
inDegreeMapRDD.map(vertex_iter => {
if (vertex_iter._1 == vertex._1) {
(vertex._1, newInDegree)
}
else{
vertex_iter
}
});
// Add this vertex to the result if its in-degree has become zero
if (newInDegree == 0) {
queue.enqueue(vertex)
}
})
})
}
return sortedResources
}
}
Resource.scala
class Resource(val id: Long) extends Serializable {
override def toString(): String = {
"id = " + id
}
}
Relation.scala
class Relation(val srcId: Long, val name: String, val dstId: Long) extends Serializable {
override def toString(): String = {
srcId + " " + name + " " + dstId
}
}
I am getting the error:
org.apache.spark.SparkException: RDD transformations and actions can only be invoked by the driver, not inside of other transformations; for example, rdd1.map(x => rdd2.values.count() * x) is invalid because the values transformation and count action cannot be performed inside of the rdd1.map transformation. For more information, see SPARK-5063.
for the line val oldInDegree = inDegreeMapRDD.filter(vertex_iter => (vertex_iter._1 == vertex._1)).first()._2.
I guess this is because it is illegal to modify an RDD inside the for-each loop of another RDD.
Also, I fear that queue.enqueue(vertex) will not work, since it is not possible to modify a local collection from inside an RDD's for-each loop.
How do I correctly implement this topological sort algorithm?
The full stack trace of the exception is uploaded here (I had to upload it externally to avoid exceeding Stack Overflow's body size limit).
vertex_neighbours.foreach(arr => {
val neighbour_array = arr._2
neighbour_array.foreach(vertex => {
. . .
The outer foreach could be replaced by a plain for loop over a collected array:
val vertex_neighbours = neighbours.filter(vertex => vertex._1.equals(vertex_top._1)).collect()
You need to collect the RDD to the driver before looping over it; invoking RDD transformations or actions from inside another RDD's foreach is exactly what the SPARK-5063 error forbids.
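Taking that one step further, all of the per-vertex bookkeeping can be done on the driver. The following is only a sketch of that restructuring, under the assumption that the graph fits in driver memory; it reuses the question's imports, its Resource class, and the vertices, inDegreeZeroList, inDegrees and neighbours values built in topoSort, and keeps the in-degrees in a local mutable map instead of an RDD so nothing is updated from inside another RDD's closure.
import scala.collection.mutable

// Collect the pieces the loop needs once, on the driver.
val vertexMap: Map[VertexId, Resource] = vertices.collect().toMap
val neighbourMap: Map[VertexId, Array[VertexId]] =
  neighbours.mapValues(arr => arr.map(_._1)).collect().toMap
val inDegreeMap: mutable.Map[VertexId, Int] =
  mutable.Map(inDegreeZeroList.map(v => (v._1, 0)).union(inDegrees).collect(): _*)

// Seed the queue with every vertex whose in-degree is zero.
val queue = new mutable.Queue[(VertexId, Resource)]
inDegreeMap.foreach { case (id, d) => if (d == 0) queue.enqueue((id, vertexMap(id))) }

while (queue.nonEmpty) {
  val (id, resource) = queue.dequeue()
  sortedResources.add((id, resource))
  // Decrease the in-degree of each outgoing neighbour and enqueue it once it reaches zero.
  for (n <- neighbourMap.getOrElse(id, Array.empty[VertexId])) {
    inDegreeMap(n) -= 1
    if (inDegreeMap(n) == 0) queue.enqueue((n, vertexMap(n)))
  }
}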

Scala: Save result in a toDf temp table

I'm trying to save some analysis results in a temp table with toDF, but I receive the following error: ":215: error: value toDF is not a member of Double".
I'm reading data from a Cassandra table and doing some calculations, and I want to save those results in a temp table.
I'm new to Scala; can somebody help me, please?
My code:
case class Consumo(consumo:Double, consumo_mensal: Double, mes: org.joda.time.DateTime,ano: org.joda.time.DateTime, soma_pf: Double,empo_gasto: Double);
object Analysegridata{
val conf = new SparkConf(true)
.set("spark.cassandra.connection.host","127.0.0.1").setAppName("LiniarRegression")
.set("spark.cassandra.connection.port", "9042")
.set("spark.driver.allowMultipleContexts", "true")
.set("spark.streaming.receiver.writeAheadLog.enable", "true");
val sc = new SparkContext(conf);
val ssc = new StreamingContext(sc, Seconds(1))
val sqlContext = new org.apache.spark.sql.SQLContext(sc);
val checkpointDirectory = "/var/lib/cassandra/data"
ssc.checkpoint(checkpointDirectory) // set checkpoint directory
// val context = StreamingContext.getOrCreate(checkpointDirectory)
import sqlContext.implicits._
JavaSparkContext.fromSparkContext(sc);
def rddconsumo(rddData: Double): Double = {
val rddData: Double = {
implicit val data = conf
val grid = sc.cassandraTable("smartgrids", "analyzer").as((r:Double) => (r)).collect
def goto(cs: Array[Double]): Double = {
var consumo = 0.0;
var totaldias = 0;
var soma_pf = 0.0;
var somamc = 0.0;
var tempo_gasto = 0.0;
var consumo_mensal = 0.0;
var i=0
for (i <- 0 until grid.length) {
val minutos = sc.cassandraTable("smartgrids","analyzer_temp").select("timecol", "MINUTE");
val horas = sc.cassandraTable("smartgrids","analyzer_temp").select("timecol","HOUR_OF_DAY");
val dia = sc.cassandraTable("smartgrids","analyzer_temp").select("timecol", "DAY_OF_MONTH");
val ano = sc.cassandraTable("smartgrids","analyzer_temp").select("timecol", "YEAR");
val mes = sc.cassandraTable("smartgrids","analyzer_temp").select("timecol", "MONTH");
val potencia = sc.cassandraTable("smartgrids","analyzer_temp").select("n_pf1", "n_pf2", "n_pf3")
def convert_minutos (minuto : Int) : Double ={
minuto/60
}
dia.foreach (i => {
def adSum(potencia: Array[Double]) = {
var i=0;
while (i < potencia.length) {
soma_pf += potencia(i);
i += 1;
soma_pf;
println("Potemcia =" + soma_pf)
}
}
def tempo(minutos: Array[Int]) = {
var i=0;
while (i < minutos.length) {
somamc += convert_minutos(minutos(i))
i += 1;
somamc
}
}
def tempogasto(horas: Array[Int]) = {
var i=0;
while (i < horas.length) {
tempo_gasto = horas(i) + somamc;
i += 1;
tempo_gasto;
println("Temo que o aparelho esteve ligado =" + tempo_gasto)
}
}
def consumof(dia: Array[Int]) = {
var i=0;
while (i < dia.length) {
consumo = soma_pf * tempo_gasto;
i += 1;
consumo;
println("Consumo diario =" + consumo)
}
}
})
mes.foreach (i => {
def totaltempo(dia: Array[Int]) = {
var i = 0;
while(i < dia.length){
totaldias += dia(i);
i += 1;
totaldias;
println("Numero total de dias =" + totaldias)
}
}
def consumomensal(mes: Array[Int]) = {
var i = 0;
while(i < mes.length){
consumo_mensal = consumo * totaldias;
i += 1;
consumo_mensal;
println("Consumo Mensal =" + consumo_mensal);
}
}
})
}
consumo;
totaldias;
consumo_mensal;
soma_pf;
tempo_gasto;
somamc
}
rddData
}
rddData.toDF().registerTempTable("rddData")
}
ssc.start()
ssc.awaitTermination()
error: value toDF is not a member of Double
It's rather unclear what exactly you're trying to do (that's a lot of code; try providing a minimal example), but there are a few apparent issues:
rddData has type Double: it seems like it should be an RDD[Double] (a distributed collection of Double values). Trying to save a single Double value as a table doesn't make much sense, and indeed it doesn't work: toDF can be called on an RDD, not on any arbitrary type, and specifically not on Double, as the compiler warns.
You collect() the data: if you want to load an RDD, transform it, and then save it as a table, collect() should probably not be called on that RDD. collect() sends all the data (distributed across the cluster) to the single "driver" machine (the one running this code), after which you're no longer taking advantage of the cluster, and since you're no longer holding an RDD you can't convert the data into a DataFrame using toDF.
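To make the shape of the fix concrete, here is a minimal sketch, not the question's actual logic: the Reading case class and its column names are hypothetical, and it assumes the spark-cassandra-connector plus the sc, sqlContext and sqlContext.implicits._ already set up in the question.
import com.datastax.spark.connector._

// Hypothetical row type; the field names must match columns of smartgrids.analyzer.
case class Reading(consumo: Double, tempoGasto: Double)

// Keep the data distributed as an RDD and do the per-row computation with map ...
val readings = sc.cassandraTable[Reading]("smartgrids", "analyzer")
val results = readings.map(r => (r.consumo, r.consumo * r.tempoGasto))

// ... then convert the RDD (of tuples here, not a single Double) to a DataFrame
// and register it as a temp table.
results.toDF("consumo", "consumo_total").registerTempTable("rddData")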

Spark job returns a different result on each run

I am working on Scala code that performs linear regression on certain datasets. Right now I am using 20 cores and 25 executors, and every time I run the Spark job I get a different result.
The input files are 2 GB and 400 MB. However, when I run the job with 20 cores and 1 executor, I get consistent results.
Has anyone experienced such a thing so far?
Please find the code below:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SchemaRDD
import org.apache.spark.Partitioner
import org.apache.spark.storage.StorageLevel
object TextProcess{
def main(args: Array[String]){
val conf = new SparkConf().set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val numExecutors=(conf.get("spark.executor.instances").toInt)
// Read the 2 input files
// First file is either cases / controls
val input1 = sc.textFile(args(0))
// Second file is Gene Expression
val input2 = sc.textFile(args(1))
//collecting header information
val header1=sc.parallelize(input1.take(1))
val header2=sc.parallelize(input2.take(1))
//mapping data without the header information
val map1 = input1.subtract(header1).map(x => (x.split(" ")(0)+x.split(" ")(1), x))
val map2 = input2.subtract(header2).map(x => (x.split(" ")(0)+x.split(" ")(1), x))
//joining data. here is where the order was getting affected.
val joinedMap = map1.join(map2)
//adding the header back to the top of RDD
val x = header1.union(joinedMap.map{case(x,(y,z))=>y})
val y = header2.union(joinedMap.map{case(x,(y,z))=>z})
//removing irrelevant columns
val rddX = x.map(x=>x.split(" ").drop(3)).zipWithIndex.map{case(a,b)=> a.map(x=>b.toString+" "+x.toString)}
val rddY = y.map(x=>x.split(" ").drop(2)).zipWithIndex.map{case(a,b)=> a.map(x=>b.toString+" "+x.toString)}
//transposing and cross joining data. This keeps the identifier at the start
val transposedX = rddX.flatMap(x => x.zipWithIndex.map(x=>x.swap)).reduceByKey((a,b)=> a+":"+b).map{case(a,b)=>b.split(":").sorted}
val transposedY = rddY.flatMap(x => x.zipWithIndex.map(x=>x.swap)).reduceByKey((a,b)=> a+":"+b).map{case(a,b)=>b.split(":").sorted}.persist(StorageLevel.apply(false, true, false, false, numExecutors))
val cleanedX = transposedX.map(x=>x.map(x=>x.slice(x.indexOfSlice(" ")+1,x.length)))
val cleanedY = transposedY.map(x=>x.map(x=>x.slice(x.indexOfSlice(" ")+1,x.length))).persist(StorageLevel.apply(false, true, false, false, numExecutors))
val cartXY = cleanedX.cartesian(cleanedY)
val finalDataSet= cartXY.map{case(a,b)=>a zip b}
//convert to key value pair
val regressiondataset = finalDataSet.map(x=>(x(0),x.drop(1).filter{case(a,b)=> a!="NA" && b!="NA" && a!="null" && b!="null"}.map{case(a,b)=> (a.toDouble, b.toDouble)}))
val linearOutput = regressiondataset.map(s => new LinearRegression(s._1 ,s._2).outputVal)
linearOutput.saveAsTextFile(args(2))
cleanedY.unpersist()
transposedY.unpersist()
}
}
class LinearRegression (val keys: (String, String),val pairs: Array[(Double,Double)]) {
val size = pairs.size
// first pass: read in data, compute xbar and ybar
val sums = pairs.aggregate(new X_X2_Y(0D,0D,0D))(_ + new X_X2_Y(_),_+_)
val bars = (sums.x / size, sums.y / size)
// second pass: compute summary statistics
val sumstats = pairs.foldLeft(new X2_Y2_XY(0D,0D,0D))(_ + new X2_Y2_XY(_, bars))
val beta1 = sumstats.xy / sumstats.x2
val beta0 = bars._2 - (beta1 * bars._1)
val betas = (beta0, beta1)
//println("y = " + ("%4.3f" format beta1) + " * x + " + ("%4.3f" format beta0))
// analyze results
val correlation = pairs.aggregate(new RSS_SSR(0D,0D))(_ + RSS_SSR.build(_, bars, betas),_+_)
val R2 = correlation.ssr / sumstats.y2
val svar = correlation.rss / (size - 2)
val svar1 = svar / sumstats.x2
val svar0 = ( svar / size ) + ( bars._1 * bars._1 * svar1)
val svar0bis = svar * sums.x2 / (size * sumstats.x2)
/* println("R^2 = " + R2)
println("std error of beta_1 = " + Math.sqrt(svar1))
println("std error of beta_0 = " + Math.sqrt(svar0))
println("std error of beta_0 = " + Math.sqrt(svar0bis))
println("SSTO = " + sumstats.y2)
println("SSE = " + correlation.rss)
println("SSR = " + correlation.ssr)*/
def outputVal() = keys._1
+"\t"+keys._2
+"\t"+beta1
+"\t"+beta0
+"\t"+R2
+"\t"+Math.sqrt(svar1)
+"\t"+Math.sqrt(svar0)
+"\t"+sumstats.y2
+"\t"+correlation.rss
+"\t"+correlation.ssr+"\t;
}
object RSS_SSR {
def build(p: (Double,Double), bars: (Double,Double), betas: (Double,Double)): RSS_SSR = {
val fit = (betas._2 * p._1) + betas._1
val rss = (fit-p._2) * (fit-p._2)
val ssr = (fit-bars._2) * (fit-bars._2)
new RSS_SSR(rss, ssr)
}
}
class RSS_SSR(val rss: Double, val ssr: Double) {
def +(p: RSS_SSR): RSS_SSR = new RSS_SSR(rss+p.rss, ssr+p.ssr)
}
class X_X2_Y(val x: Double, val x2: Double, val y: Double) {
def this(p: (Double,Double)) = this(p._1, p._1*p._1, p._2)
def +(p: X_X2_Y): X_X2_Y = new X_X2_Y(x+p.x,x2+p.x2,y+p.y)
}
class X2_Y2_XY(val x2: Double, val y2: Double, val xy: Double) {
def this(p: (Double,Double), bars: (Double,Double)) = this((p._1-bars._1)*(p._1-bars._1), (p._2-bars._2)*(p._2-bars._2),(p._1-bars._1)*(p._2-bars._2))
def +(p: X2_Y2_XY): X2_Y2_XY = new X2_Y2_XY(x2+p.x2,y2+p.y2,xy+p.xy)
}