Implementing topological sort in Spark GraphX - scala

I am trying to implement topological sort using Spark's GraphX library.
This is the code I've written so far:
MyObject.scala
import java.util.ArrayList
import scala.collection.mutable.Queue
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.EdgeDirection
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.Graph.graphToGraphOps
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
object MyObject {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Spark-App").setMaster("local[2]")
val sc = new SparkContext(conf)
val resources: RDD[Resource] = makeResources(sc)
val relations: RDD[Relation] = makeRelations(sc)
println("Building graph ...")
var graph = buildGraph(resources, relations, sc)
println("Graph built!!")
println("Testing topo sort ...")
val topoSortResult = topoSort(graph, sc);
println("topoSortResult = " + topoSortResult)
println("Testing topo sort done!")
}
def buildGraph(resources: RDD[Resource], relations: RDD[Relation], sc: SparkContext): Graph[Resource, Relation] =
{
val vertices: RDD[(Long, Resource)] = resources.map(resource => (resource.id, resource))
val edges: RDD[Edge[Relation]] = relations.map(relation => Edge(relation.srcId, relation.dstId, relation))
var graph = Graph[Resource, Relation](vertices, edges)
graph
}
def makeResources(sc: SparkContext): RDD[Resource] =
{
var list: List[Resource] = List()
list = list :+ new Resource(1L)
list = list :+ new Resource(2L)
list = list :+ new Resource(3L)
list = list :+ new Resource(4L)
list = list :+ new Resource(5L)
sc.parallelize(list)
}
def makeRelations(sc: SparkContext): RDD[Relation] =
{
var list: List[Relation] = List()
list = list :+ new Relation(1L, "depends_on", 2L)
list = list :+ new Relation(3L, "depends_on", 2L)
list = list :+ new Relation(4L, "depends_on", 2L)
list = list :+ new Relation(5L, "depends_on", 2L)
sc.parallelize(list)
}
def topoSort(graph: Graph[Resource, Relation], sc: SparkContext): java.util.List[(VertexId, Resource)] =
{
// Will contain the result
val sortedResources: java.util.List[(VertexId, Resource)] = new ArrayList()
// Contains all the vertices
val vertices = graph.vertices
// Contains all the vertices whose in-degree > 0
val inDegrees = graph.inDegrees;
val inDegreesKeys_array = inDegrees.keys.collect();
// Contains all the vertices whose in-degree == 0
val inDegreeZeroList = vertices.filter(vertex => !inDegreesKeys_array.contains(vertex._1))
// A map of vertexID vs its in-degree
val inDegreeMapRDD = inDegreeZeroList.map(vertex => (vertex._1, 0)).union(inDegrees);
// Insert all the resources whose in-degree == 0 into a queue
val queue = new Queue[(VertexId, Resource)]
for (vertex <- inDegreeZeroList.toLocalIterator) { queue.enqueue(vertex) }
// Get an RDD containing the outgoing edges of every vertex
val neighbours = graph.collectNeighbors(EdgeDirection.Out)
// Initiate the algorithm
while (!queue.isEmpty) {
val vertex_top = queue.dequeue()
// Add the topmost element of the queue to the result
sortedResources.add(vertex_top)
// Get the neigbours (from outgoing edges) of this vertex
// This will be an RDD containing just 1 element which will be an array of neighbour vertices
val vertex_neighbours = neighbours.filter(vertex => vertex._1.equals(vertex_top._1))
// For each vertex, decrease its in-degree by 1
vertex_neighbours.foreach(arr => {
val neighbour_array = arr._2
neighbour_array.foreach(vertex => {
val oldInDegree = inDegreeMapRDD.filter(vertex_iter => (vertex_iter._1 == vertex._1)).first()._2
val newInDegree = oldInDegree - 1
// Reflect the new in-degree in the in-degree map RDD
inDegreeMapRDD.map(vertex_iter => {
if (vertex_iter._1 == vertex._1) {
(vertex._1, newInDegree)
}
else{
vertex_iter
}
});
// Add this vertex to the result if its in-degree has become zero
if (newInDegree == 0) {
queue.enqueue(vertex)
}
})
})
}
return sortedResources
}
}
Resource.scala
class Resource(val id: Long) extends Serializable {
override def toString(): String = {
"id = " + id
}
}
Relation.scala
class Relation(val srcId: Long, val name: String, val dstId: Long) extends Serializable {
override def toString(): String = {
srcId + " " + name + " " + dstId
}
}
I am getting the error :
org.apache.spark.SparkException: RDD transformations and actions can only be invoked by the driver, not inside of other transformations; for example, rdd1.map(x => rdd2.values.count() * x) is invalid because the values transformation and count action cannot be performed inside of the rdd1.map transformation. For more information, see SPARK-5063.
for the line val oldInDegree = inDegreeMapRDD.filter(vertex_iter => (vertex_iter._1 == vertex._1)).first()._2.
I guess this is because it is illegal to modify an RDD inside the for-each loop of some other RDD.
Also, I fear that queue.enqueue(vertex) will not work, since it is not possible to modify a local collection inside a for-each loop.
How do I correctly implement this topological sort algorithm ?
The full stack trace of the exception is uploaded here (Had to upload it externally to prevent exceeding the body size limit of StackOverflow).

vertex_neighbours.foreach(arr => {
val neighbour_array = arr._2
neighbour_array.foreach(vertex => {
. . .
The outer foreach could be replaced by a for loop.

val vertex_neighbours = neighbours.filter(vertex => vertex._1.equals(vertex_top._1)).collect()
You need to get the RDD before doing for loop over it.

Related

How to implement Levenshtein Distance Algorithm in Scala

I've a text file which contains the information about the sender and messages and the format is sender,messages. I want to use Levenshtein Distance Algorithm with threshold of 70% and want to store the similar messages to the Map. In the Map, My key is String and value is List[String]
For example my requirement is: If my messages are abc, bcd, cdf.
step1: First I should add the message 'abc' to the List. map.put("Group1",abc.toList)
step2: Next, I should compare the 'bcd'(2nd message) with 'abc'(1st message). If they meets the threshold of 70% then I should add the 'bcd' to List. Now, 'abc' and 'bcd' are added under the same key called 'Group1'.
step3: Now, I should get all the elements from Map. Currently G1 only with 2 values(abc,bcd), next compare the current message 'cdf' with 'abc' or 'bcd' (As 'abc' and 'bcd' is similar comparing with any one of them would be enough)
step4: If did not meet the threshold, I should create a new key "Group2" and add that message to the List and so on.
The 70% threshold means, For example:
message1: Dear customer! your mobile number 9032412236 has been successfully recharged with INR 500.00
message2: Dear customer! your mobile number 7999610201 has been successfully recharged with INR 500.00
Here, the Levenshtein Distance between these two is 8. We can check this here: https://planetcalc.com/1721/
8 edits needs to be done, 8 characters did not match out of (message1.length+message2.length)/2
If I assume the first message is of 100 characters and second message is of 100 characters then the average length is 100, out of 100, 8 characters did not match which means the accuracy level of this is 92%, so here, I should keep threshold 70%.
If Levenshtein distance matching at least 70%, then take them as similar.
I'm using the below library:
libraryDependencies += "info.debatty" % "java-string-similarity" % "2.0.0"
My code:
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer
object Demo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("My App")
val sc = new SparkContext(conf)
val inputFile = "D:\\MyData.txt"
val data = sc.textFile(inputFile)
val data2 = data.map(line => {
val arr = line.split(","); (arr(0), arr(1))
})
val grpData = data2.groupByKey()
val myMap = scala.collection.mutable.Map.empty[String, List[String]]
for (values <- grpData.values.collect) {
val list = ListBuffer[String]()
for (value <- values) {
println(values)
if (myMap.isEmpty) {
list += value
myMap.put("G1", list.toList)
} else {
val currentMsg = value
val valuePartOnly = myMap.valuesIterator.toString()
for (messages <- valuePartOnly) {
def levenshteinDistance(currentMsg: String, messages: String) = {
???//TODO: Implement distance
}
}
}
}
}
}
}
After the else part, I'm not sure how do I start with this algorithm.
I do not have any output sample. So, I've explained it step by step.
Please check from step1 to step4.
Thanks.
I'm not really certain about next code I did not tried it, but I hope it demonstrates the idea:
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer
object Demo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("My App")
val distance: Levenshtein = new Levenshtein(); //Create object for calculation distance
def levenshteinDistance(left: String, right: String): Double = {
// I'm not really certain about this, how you would like to calculate relative distance?
// Relatevly to string with max size, min size, left or right?
l.distance(left, right) / Math.max(left.size, right.size)
}
val sc = new SparkContext(conf)
val inputFile = "D:\\MyData.txt"
val data = sc.textFile(inputFile)
val data2 = data.map(line => {
val arr = line.split(","); (arr(0), arr(1))
})
val grpData = data2.groupByKey()
val messages = scala.collection.mutable.Map.empty[String, List[String]]
var group = 1
for (values <- grpData.values.collect) {
val list = ListBuffer[String]()
for (value <- values) {
println(values)
if (messages.isEmpty) {
list += value
messages.put("G$group", list.toList)
} else {
val currentMsg = value
val group = messages.values.find {
case(key, messages) => messages.forall(message => levenshteinDistance(currentMsg, message) <= 0.7)
}._1.getOrElse {
group += 1
"G$group"
}
val groupMessages = messages.getOrEse(group, ListBuffer.empty[String])
groupMessages.append(currentMsg)
messages.put(currentMsg, groupMessages)
}
}
}
}
}

Scala/Spark: Converting zero inflated data in dataframe to libsvm

I am very new to scala (typically I do this in R)
I have imported a large dataframe (2000+ columns, 100000+ rows) that is zero-inflated.
Task
To convert the data to libsvm format
Steps
As I understand the steps are as follows
Ensure feature columns are set to DoubleType and Target is an Int
Iterate through each row, retaining each value >0 in one array and index of its column in another array
Convert to RDD[LabeledPoint]
Save RDD in libsvm format
I am stuck on 3 (but maybe) because I am doing step 2 wrong.
Here is my code:
Main Function:
#Test
def testSpark(): Unit =
{
try
{
var mDF: DataFrame = spark.read.option("header", "true").option("inferSchema", "true").csv("src/test/resources/knimeMergedTRimmedVariables.csv")
val mDFTyped = castAllTypedColumnsTo(mDF, IntegerType, DoubleType)
val indexer = new StringIndexer()
.setInputCol("Majors_Final")
.setOutputCol("Majors_Final_Indexed")
val mDFTypedIndexed = indexer.fit(mDFTyped).transform(mDFTyped)
val mDFFinal = castColumnTo(mDFTypedIndexed,"Majors_Final_Indexed", IntegerType)
//only doubles accepted by sparse vector, so that's what we filter for
val fieldSeq: scala.collection.Seq[StructField] = schema.fields.toSeq.filter(f => f.dataType == DoubleType)
val fieldNameSeq: Seq[String] = fieldSeq.map(f => f.name)
val labeled:DataFrame = mDFFinal.map(row => convertRowToLabeledPoint(row,fieldNameSeq,row.getAs("Majors_Final_Indexed"))).toDF()
assertTrue(true)
}
catch
{
case ex: Exception =>
{
println(s"There has been an Exception. Message is ${ex.getMessage} and ${ex}")
fail()
}
}
}
Convert each row to LabeledPoint:
#throws(classOf[Exception])
private def convertRowToLabeledPoint(rowIn: Row, fieldNameSeq: Seq[String], label:Int): LabeledPoint =
{
try
{
val values: Map[String, Double] = rowIn.getValuesMap(fieldNameSeq)
val sortedValuesMap = ListMap(values.toSeq.sortBy(_._1): _*)
val rowValuesItr: Iterable[Double] = sortedValuesMap.values
var positionsArray: ArrayBuffer[Int] = ArrayBuffer[Int]()
var valuesArray: ArrayBuffer[Double] = ArrayBuffer[Double]()
var currentPosition: Int = 0
rowValuesItr.foreach
{
kv =>
if (kv > 0)
{
valuesArray += kv;
positionsArray += currentPosition;
}
currentPosition = currentPosition + 1;
}
val lp:LabeledPoint = new LabeledPoint(label, org.apache.spark.mllib.linalg.Vectors.sparse(positionsArray.size,positionsArray.toArray, valuesArray.toArray))
return lp
}
catch
{
case ex: Exception =>
{
throw new Exception(ex)
}
}
}
Problem
So then I try to create a dataframe of labeledpoints which can easily be converted to an RDD.
val labeled:DataFrame = mDFFinal.map(row => convertRowToLabeledPoint(row,fieldNameSeq,row.getAs("Majors_Final_Indexed"))).toDF()
But I get the following error:
SparkTest.scala:285: error: Unable to find encoder for type stored in a Dataset. Primitive types (Int, String, etc) and Product types (case classes) are supported by importing spark.implicits._ Support for seri
alizing other types will be added in future releases.
[INFO] val labeled:DataFrame = mDFFinal.map(row => convertRowToLabeledPoint(row,fieldNameSeq,row.getAs("Majors_Final_Indexed"))).toDF()
OK, so I skipped the DataFrame and created an Array of LabeledPoints whish is easily converted to an RDD. The rest is easy.
I stress, that while this works, I am new to scala and there may be more efficient ways to do this.
Main Function is now as follows:
val mDF: DataFrame = spark.read.option("header", "true").option("inferSchema", "true").csv("src/test/resources/knimeMergedTRimmedVariables.csv")
val mDFTyped = castAllTypedColumnsTo(mDF, IntegerType, DoubleType)
val indexer = new StringIndexer()
.setInputCol("Majors_Final")
.setOutputCol("Majors_Final_Indexed")
val mDFTypedIndexed = indexer.fit(mDFTyped).transform(mDFTyped)
val mDFFinal = castColumnTo(mDFTypedIndexed,"Majors_Final_Indexed", IntegerType)
mDFFinal.show()
//only doubles accepted by sparse vector, so that's what we filter for
val fieldSeq: scala.collection.Seq[StructField] = mDFFinal.schema.fields.toSeq.filter(f => f.dataType == DoubleType)
val fieldNameSeq: Seq[String] = fieldSeq.map(f => f.name)
var positionsArray: ArrayBuffer[LabeledPoint] = ArrayBuffer[LabeledPoint]()
mDFFinal.collect().foreach
{
row => positionsArray+=convertRowToLabeledPoint(row,fieldNameSeq,row.getAs("Majors_Final_Indexed"));
}
val mRdd:RDD[LabeledPoint]= spark.sparkContext.parallelize(positionsArray.toSeq)
MLUtils.saveAsLibSVMFile(mRdd, "./output/libsvm")

Sorting a DStream and taking topN

I have some DStream in Spark Scala and I want to sort it then take the top N.
The problem is that whenever I try to run it I get NotSerializableException and the exception message says:
This is because the DStream object is being referred to from within the closure.
The problem is that I don't know how to solve it:
Here is my try:
package com.badrit.realtime
import java.util.Date
import com.badrit.drivers.UnlimitedSpaceTimeDriver
import com.badrit.model.{CellBuilder, DataReader, Trip}
import com.badrit.utility.Printer
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Duration, Milliseconds, StreamingContext}
import scala.collection.mutable
object StreamingDriver {
val appName: String = "HotSpotRealTime"
val hostName = "localhost"
val port = 5050
val constrains = UnlimitedSpaceTimeDriver.constrains;
var streamingRate = 1;
var windowSize = 8;
var slidingInterval = 2;
val cellBuilder = new CellBuilder(constrains)
val inputFilePath = "/home/ahmedelgamal/Downloads/green_tripdata_2015-02.csv"
def prepareTestData(sparkStreamCtx: StreamingContext): InputDStream[Trip] = {
val sparkCtx = sparkStreamCtx.sparkContext
val textFile: RDD[String] = sparkCtx.textFile(inputFilePath)
val data: RDD[Trip] = new DataReader().getTrips(textFile)
val groupedData = data.filter(_.pickup.date.before(new Date(2015, 1, 2, 0, 0, 0)))
.groupBy(trip => trip.pickup.date.getMinutes).sortBy(_._1).map(_._2).collect()
printf("Grouped Data Count is " + groupedData.length)
var dataQueue: mutable.Queue[RDD[Trip]] = mutable.Queue.empty;
groupedData.foreach(trips => dataQueue += sparkCtx.makeRDD(trips.toArray))
printf("\n\nTest Queue size is " + dataQueue.size)
groupedData.zipWithIndex.foreach { case (trips: Iterable[Trip], index: Int) => {
println("Items List " + index)
val passengers: Array[Int] = trips.map(_.passengers).toArray
val cnt = passengers.length
println("Sum is " + passengers.sum)
println("Cnt is " + cnt)
val passengersRdd = sparkCtx.parallelize(passengers)
println("Mean " + passengersRdd.mean())
println("Stdv" + passengersRdd.stdev())
}
}
sparkStreamCtx.queueStream(dataQueue, true)
}
def cellCreator(trip: Trip) = cellBuilder.cellForCarStop(trip.pickup)
def main(args: Array[String]) {
if (args.length < 1) {
streamingRate = 1;
windowSize = 3 //2 hours 60 * 60 * 1000L
slidingInterval = 2 //0.5 hour 60 * 60 * 1000L
}
else {
streamingRate = args(0).toInt;
windowSize = args(1).toInt
slidingInterval = args(2).toInt
}
val sparkConf = new SparkConf().setAppName(appName).setMaster("local[*]")
val sparkStreamCtx = new StreamingContext(sparkConf, Milliseconds(streamingRate))
sparkStreamCtx.sparkContext.setLogLevel("ERROR")
sparkStreamCtx.checkpoint("/tmp")
val data: InputDStream[Trip] = prepareTestData(sparkStreamCtx)
val dataWindow = data.window(new Duration(windowSize), new Duration(slidingInterval))
//my main problem lies in the following line
val newDataWindow = dataWindow.transform(rdd => sparkStreamCtx.sparkContext.parallelize(rdd.take(10)))
newDataWindow.print
sparkStreamCtx.start()
sparkStreamCtx.awaitTerminationOrTimeout(1000)
}
}
I don't mind any other ways to sort a DStream and get its top N rather than my way.
You can use transform method in the DStream object then sort the input RDD and take n elements of it in a list, then filter the original RDD to be contained in this list.
val n = 10
val topN = result.transform(rdd =>{
val list = rdd.sortBy(_._1).take(n)
rdd.filter(list.contains)
})
topN.print

Iterating over cogrouped RDD

I have used a cogroup function and obtain following RDD:
org.apache.spark.rdd.RDD[(Int, (Iterable[(Int, Long)], Iterable[(Int, Long)]))]
Before the map operation joined object would look like this:
RDD[(Int, (Iterable[(Int, Long)], Iterable[(Int, Long)]))]
(-2095842000,(CompactBuffer((1504999740,1430096464017), (613904354,1430211912709), (-1514234644,1430288363100), (-276850688,1430330412225)),CompactBuffer((-511732877,1428682217564), (1133633791,1428831320960), (1168566678,1428964645450), (-407341933,1429009306167), (-1996133514,1429016485487), (872888282,1429031501681), (-826902224,1429034491003), (818711584,1429111125268), (-1068875079,1429117498135), (301875333,1429121399450), (-1730846275,1429131773065), (1806256621,1429135583312))))
(352234000,(CompactBuffer((1350763226,1430006650167), (-330160951,1430320010314)),CompactBuffer((2113207721,1428994842593), (-483470471,1429324209560), (1803928603,1429426861915))))
Now I want to do the following:
val globalBuffer = ListBuffer[Double]()
val joined = data1.cogroup(data2).map(x => {
val listA = x._2._1.toList
val listB = x._2._2.toList
for(tupleB <- listB) {
val localResults = ListBuffer[Double]()
val itemToTest = Set(tupleB._1)
val tempList = ListBuffer[(Int, Double)]()
for(tupleA <- listA) {
val tValue = someFunctionReturnDouble(tupleB._2, tupleA._2)
val i = (tupleA._1, tValue)
tempList += i
}
val sortList = tempList.sortWith(_._2 > _._2).slice(0,20).map(i => i._1)
val intersect = sortList.toSet.intersect(itemToTest)
if (intersect.size > 0)
localResults += 1.0
else localResults += 0.0
val normalized = sum(localResults.toList)/localResults.size
globalBuffer += normalized
}
})
//method sum
def sum(xs: List[Double]): Double = {//do the sum}
At the end of this I was expecting joined to be a list with double values. But when I looked at it it was unit. Also I will this is not the Scala way of doing it. How do I obtain globalBuffer as the final result.
Hmm, if I understood your code correctly, it could benefit from these improvements:
val joined = data1.cogroup(data2).map(x => {
val listA = x._2._1.toList
val listB = x._2._2.toList
val localResults = listB.map {
case (intBValue, longBValue) =>
val itemToTest = intBValue // it's always one element
val tempList = listA.map {
case (intAValue, longAValue) =>
(intAValue, someFunctionReturnDouble(longBvalue, longAValue))
}
val sortList = tempList.sortWith(-_._2).slice(0,20).map(i => i._1)
if (sortList.toSet.contains(itemToTest)) { 1.0 } else {0.0}
// no real need to convert to a set for 20 elements, by the way
}
sum(localResults)/localResults.size
})
Transformations of RDDs are not going to modify globalBuffer. Copies of globalBuffer are made and sent out to each of the workers, but any modifications to these copies on the workers will never modify the globalBuffer that exists on the driver (the one you have defined outside the map on the RDD.) Here's what I do (with a few additional modifications):
val joined = data1.cogroup(data2) map { x =>
val iterA = x._2._1
val iterB = x._2._2
var count, positiveCount = 0
val tempList = ListBuffer[(Int, Double)]()
for (tupleB <- iterB) {
tempList.clear
for(tupleA <- iterA) {
val tValue = someFunctionReturnDouble(tupleB._2, tupleA._2)
tempList += ((tupleA._1, tValue))
}
val sortList = tempList.sortWith(_._2 > _._2).iterator.take(20)
if (sortList.exists(_._1 == tupleB._1)) positiveCount += 1
count += 1
}
positiveCount.toDouble/count
}
At this point you can obtain of local copy of the proportions by using joined.collect.

Exception in thread "main" java.lang.NumberFormatException

I am new to scala. When i try to run the example program PageRank its showing the following error..
Exception in thread "main" java.lang.NumberFormatException: For input
string: "5" at
scala.collection.immutable.StringLike$class.parseBoolean(StringLike.scala:240)
at
scala.collection.immutable.StringLike$class.toBoolean(StringLike.scala:228)
at scala.collection.immutable.StringOps.toBoolean(StringOps.scala:31)
at
spark.bagel.examples.WikipediaPageRank$.main(WikipediaPageRank.scala:30)
at
spark.bagel.examples.WikipediaPageRank.main(WikipediaPageRank.scala)
import spark._
import spark.SparkContext._
import spark.bagel._
import spark.bagel.Bagel._
import scala.xml.{XML,NodeSeq}
object WikipediaPageRank {
def main(args: Array[String]) {
if (args.length < 5) {
System.err.println("Usage: WikipediaPageRank <inputFile> <threshold> <numPartitions> <host> <usePartitioner>")
System.exit(-1)
}
System.setProperty("spark.serializer", "spark.KryoSerializer")
System.setProperty("spark.kryo.registrator", classOf[PRKryoRegistrator].getName)
val inputFile = args(0)
val threshold = args(1).toDouble
val numPartitions = args(2).toInt
val host = args(3)
val usePartitioner = args(4).toBoolean
val sc = new SparkContext(host, "WikipediaPageRank")
// Parse the Wikipedia page data into a graph
val input = sc.textFile(inputFile)
println("Counting vertices...")
val numVertices = input.count()
println("Done counting vertices.")
println("Parsing input file...")
var vertices = input.map(line => {
val fields = line.split("\t")
val (title, body) = (fields(1), fields(3).replace("\\n", "\n"))
val links =
if (body == "\\N")
NodeSeq.Empty
else
try {
XML.loadString(body) \\ "link" \ "target"
} catch {
case e: org.xml.sax.SAXParseException =>
System.err.println("Article \""+title+"\" has malformed XML in body:\n"+body)
NodeSeq.Empty
}
val outEdges = links.map(link => new String(link.text)).toArray
val id = new String(title)
(id, new PRVertex(1.0 / numVertices, outEdges))
})
if (usePartitioner)
vertices = vertices.partitionBy(new HashPartitioner(sc.defaultParallelism)).cache
else
vertices = vertices.cache
println("Done parsing input file.")
// Do the computation
val epsilon = 0.01 / numVertices
val messages = sc.parallelize(Array[(String, PRMessage)]())
val utils = new PageRankUtils
val result =
Bagel.run(
sc, vertices, messages, combiner = new PRCombiner(),
numPartitions = numPartitions)(
utils.computeWithCombiner(numVertices, epsilon))
// Print the result
System.err.println("Articles with PageRank >= "+threshold+":")
val top =
(result
.filter { case (id, vertex) => vertex.value >= threshold }
.map { case (id, vertex) => "%s\t%s\n".format(id, vertex.value) }
.collect.mkString)
println(top)
}
}
Please help me in solving the error.