Cannot call methods on a stopped SparkContext - scala

When I run the following test, it throws "Cannot call methods on a stopped SparkContext". The likely cause is that I use TestSuiteBase together with a Streaming Spark context. At the line val gridEvalsRDD = ssc.sparkContext.parallelize(gridEvals) I need a SparkContext, which I access via ssc.sparkContext, and this is where the problem occurs (see the warning and error messages below):
class StreamingTest extends TestSuiteBase with BeforeAndAfter {

  test("Test 1") {
    //...
    val gridEvals = for (initialWeights <- gridParams("initialWeights");
                         stepSize <- gridParams("stepSize");
                         numIterations <- gridParams("numIterations")) yield {
      val lr = new StreamingLinearRegressionWithSGD()
        .setInitialWeights(initialWeights.asInstanceOf[Vector])
        .setStepSize(stepSize.asInstanceOf[Double])
        .setNumIterations(numIterations.asInstanceOf[Int])

      ssc = setupStreams(inputData, (inputDStream: DStream[LabeledPoint]) => {
        lr.trainOn(inputDStream)
        lr.predictOnValues(inputDStream.map(x => (x.label, x.features)))
      })

      val output: Seq[Seq[(Double, Double)]] = runStreams(ssc, numBatches, numBatches)
      val cvRMSE = calculateRMSE(output, nPoints)
      println(s"RMSE = $cvRMSE")
      (initialWeights, stepSize, numIterations, cvRMSE)
    }

    val gridEvalsRDD = ssc.sparkContext.parallelize(gridEvals)
  }
}
16/04/27 10:40:17 WARN StreamingContext: StreamingContext has already been stopped
16/04/27 10:40:17 INFO SparkContext: SparkContext already stopped.
Cannot call methods on a stopped SparkContext
UPDATE:
This is the base class TestSuiteBase:
trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging {
// Name of the framework for Spark context
def framework: String = this.getClass.getSimpleName
// Master for Spark context
def master: String = "local[2]"
// Batch duration
def batchDuration: Duration = Seconds(1)
// Directory where the checkpoint data will be saved
lazy val checkpointDir: String = {
val dir = Utils.createTempDir()
logDebug(s"checkpointDir: $dir")
dir.toString
}
// Number of partitions of the input parallel collections created for testing
def numInputPartitions: Int = 2
// Maximum time to wait before the test times out
def maxWaitTimeMillis: Int = 10000
// Whether to use manual clock or not
def useManualClock: Boolean = true
// Whether to actually wait in real time before changing manual clock
def actuallyWait: Boolean = false
// A SparkConf to use in tests. Can be modified before calling setupStreams to configure things.
val conf = new SparkConf()
.setMaster(master)
.setAppName(framework)
// Timeout for use in ScalaTest `eventually` blocks
val eventuallyTimeout: PatienceConfiguration.Timeout = timeout(Span(10, ScalaTestSeconds))
// Default before function for any streaming test suite. Override this
// if you want to add your stuff to "before" (i.e., don't call before { } )
def beforeFunction() {
if (useManualClock) {
logInfo("Using manual clock")
conf.set("spark.streaming.clock", "org.apache.spark.util.ManualClock")
} else {
logInfo("Using real clock")
conf.set("spark.streaming.clock", "org.apache.spark.util.SystemClock")
}
}
// Default after function for any streaming test suite. Override this
// if you want to add your stuff to "after" (i.e., don't call after { } )
def afterFunction() {
System.clearProperty("spark.streaming.clock")
}
before(beforeFunction)
after(afterFunction)
/**
* Run a block of code with the given StreamingContext and automatically
* stop the context when the block completes or when an exception is thrown.
*/
def withStreamingContext[R](ssc: StreamingContext)(block: StreamingContext => R): R = {
try {
block(ssc)
} finally {
try {
ssc.stop(stopSparkContext = true)
} catch {
case e: Exception =>
logError("Error stopping StreamingContext", e)
}
}
}
/**
* Run a block of code with the given TestServer and automatically
* stop the server when the block completes or when an exception is thrown.
*/
def withTestServer[R](testServer: TestServer)(block: TestServer => R): R = {
try {
block(testServer)
} finally {
try {
testServer.stop()
} catch {
case e: Exception =>
logError("Error stopping TestServer", e)
}
}
}
/**
* Set up required DStreams to test the DStream operation using the two sequences
* of input collections.
*/
def setupStreams[U: ClassTag, V: ClassTag](
input: Seq[Seq[U]],
operation: DStream[U] => DStream[V],
numPartitions: Int = numInputPartitions
): StreamingContext = {
// Create StreamingContext
val ssc = new StreamingContext(conf, batchDuration)
if (checkpointDir != null) {
ssc.checkpoint(checkpointDir)
}
// Setup the stream computation
val inputStream = new TestInputStream(ssc, input, numPartitions)
val operatedStream = operation(inputStream)
val outputStream = new TestOutputStreamWithPartitions(operatedStream,
new ArrayBuffer[Seq[Seq[V]]] with SynchronizedBuffer[Seq[Seq[V]]])
outputStream.register()
ssc
}
/**
* Set up required DStreams to test the binary operation using the sequence
* of input collections.
*/
def setupStreams[U: ClassTag, V: ClassTag, W: ClassTag](
input1: Seq[Seq[U]],
input2: Seq[Seq[V]],
operation: (DStream[U], DStream[V]) => DStream[W]
): StreamingContext = {
// Create StreamingContext
val ssc = new StreamingContext(conf, batchDuration)
if (checkpointDir != null) {
ssc.checkpoint(checkpointDir)
}
// Setup the stream computation
val inputStream1 = new TestInputStream(ssc, input1, numInputPartitions)
val inputStream2 = new TestInputStream(ssc, input2, numInputPartitions)
val operatedStream = operation(inputStream1, inputStream2)
val outputStream = new TestOutputStreamWithPartitions(operatedStream,
new ArrayBuffer[Seq[Seq[W]]] with SynchronizedBuffer[Seq[Seq[W]]])
outputStream.register()
ssc
}
/**
* Runs the streams set up in `ssc` on manual clock for `numBatches` batches and
* returns the collected output. It will wait until `numExpectedOutput` number of
* output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached.
*
* Returns a sequence of items for each RDD.
*/
def runStreams[V: ClassTag](
ssc: StreamingContext,
numBatches: Int,
numExpectedOutput: Int
): Seq[Seq[V]] = {
// Flatten each RDD into a single Seq
runStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq)
}
/**
* Runs the streams set up in `ssc` on manual clock for `numBatches` batches and
* returns the collected output. It will wait until `numExpectedOutput` number of
* output data has been collected or timeout (set by `maxWaitTimeMillis`) is reached.
*
* Returns a sequence of RDD's. Each RDD is represented as several sequences of items, each
* representing one partition.
*/
def runStreamsWithPartitions[V: ClassTag](
ssc: StreamingContext,
numBatches: Int,
numExpectedOutput: Int
): Seq[Seq[Seq[V]]] = {
assert(numBatches > 0, "Number of batches to run stream computation is zero")
assert(numExpectedOutput > 0, "Number of expected outputs after " + numBatches + " is zero")
logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput)
// Get the output buffer
val outputStream = ssc.graph.getOutputStreams.
filter(_.isInstanceOf[TestOutputStreamWithPartitions[_]]).
head.asInstanceOf[TestOutputStreamWithPartitions[V]]
val output = outputStream.output
try {
// Start computation
ssc.start()
// Advance manual clock
val clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
logInfo("Manual clock before advancing = " + clock.getTimeMillis())
if (actuallyWait) {
for (i <- 1 to numBatches) {
logInfo("Actually waiting for " + batchDuration)
clock.advance(batchDuration.milliseconds)
Thread.sleep(batchDuration.milliseconds)
}
} else {
clock.advance(numBatches * batchDuration.milliseconds)
}
logInfo("Manual clock after advancing = " + clock.getTimeMillis())
// Wait until expected number of output items have been generated
val startTime = System.currentTimeMillis()
while (output.size < numExpectedOutput &&
System.currentTimeMillis() - startTime < maxWaitTimeMillis) {
logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput)
ssc.awaitTerminationOrTimeout(50)
}
val timeTaken = System.currentTimeMillis() - startTime
logInfo("Output generated in " + timeTaken + " milliseconds")
output.foreach(x => logInfo("[" + x.mkString(",") + "]"))
assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms")
assert(output.size === numExpectedOutput, "Unexpected number of outputs generated")
Thread.sleep(100) // Give some time for the forgetting old RDDs to complete
} finally {
ssc.stop(stopSparkContext = true)
}
output
}
/**
* Verify whether the output values after running a DStream operation
* are the same as the expected output values, by comparing the output
* collections either as lists (order matters) or as sets (order does not matter)
*/
def verifyOutput[V: ClassTag](
output: Seq[Seq[V]],
expectedOutput: Seq[Seq[V]],
useSet: Boolean
) {
logInfo("--------------------------------")
logInfo("output.size = " + output.size)
logInfo("output")
output.foreach(x => logInfo("[" + x.mkString(",") + "]"))
logInfo("expected output.size = " + expectedOutput.size)
logInfo("expected output")
expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]"))
logInfo("--------------------------------")
// Match the output with the expected output
for (i <- 0 until output.size) {
if (useSet) {
assert(
output(i).toSet === expectedOutput(i).toSet,
s"Set comparison failed\n" +
s"Expected output (${expectedOutput.size} items):\n${expectedOutput.mkString("\n")}\n" +
s"Generated output (${output.size} items): ${output.mkString("\n")}"
)
} else {
assert(
output(i).toList === expectedOutput(i).toList,
s"Ordered list comparison failed\n" +
s"Expected output (${expectedOutput.size} items):\n${expectedOutput.mkString("\n")}\n" +
s"Generated output (${output.size} items): ${output.mkString("\n")}"
)
}
}
logInfo("Output verified successfully")
}
/**
* Test unary DStream operation with a list of inputs, with number of
* batches to run same as the number of expected output values
*/
def testOperation[U: ClassTag, V: ClassTag](
input: Seq[Seq[U]],
operation: DStream[U] => DStream[V],
expectedOutput: Seq[Seq[V]],
useSet: Boolean = false
) {
testOperation[U, V](input, operation, expectedOutput, -1, useSet)
}
/**
* Test unary DStream operation with a list of inputs
* @param input Sequence of input collections
* @param operation Unary DStream operation to be applied to the input
* @param expectedOutput Sequence of expected output collections
* @param numBatches Number of batches to run the operation for
* @param useSet Compare the output values with the expected output values
* as sets (order does not matter) or as lists (order matters)
*/
def testOperation[U: ClassTag, V: ClassTag](
input: Seq[Seq[U]],
operation: DStream[U] => DStream[V],
expectedOutput: Seq[Seq[V]],
numBatches: Int,
useSet: Boolean
) {
val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size
withStreamingContext(setupStreams[U, V](input, operation)) { ssc =>
val output = runStreams[V](ssc, numBatches_, expectedOutput.size)
verifyOutput[V](output, expectedOutput, useSet)
}
}
/**
* Test binary DStream operation with two lists of inputs, with number of
* batches to run same as the number of expected output values
*/
def testOperation[U: ClassTag, V: ClassTag, W: ClassTag](
input1: Seq[Seq[U]],
input2: Seq[Seq[V]],
operation: (DStream[U], DStream[V]) => DStream[W],
expectedOutput: Seq[Seq[W]],
useSet: Boolean
) {
testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet)
}
/**
* Test binary DStream operation with two lists of inputs
* @param input1 First sequence of input collections
* @param input2 Second sequence of input collections
* @param operation Binary DStream operation to be applied to the 2 inputs
* @param expectedOutput Sequence of expected output collections
* @param numBatches Number of batches to run the operation for
* @param useSet Compare the output values with the expected output values
* as sets (order does not matter) or as lists (order matters)
*/
def testOperation[U: ClassTag, V: ClassTag, W: ClassTag](
input1: Seq[Seq[U]],
input2: Seq[Seq[V]],
operation: (DStream[U], DStream[V]) => DStream[W],
expectedOutput: Seq[Seq[W]],
numBatches: Int,
useSet: Boolean
) {
val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size
withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc =>
val output = runStreams[W](ssc, numBatches_, expectedOutput.size)
verifyOutput[W](output, expectedOutput, useSet)
}
}
}

These are a few things that you should check:
Verify that the resources you specify in your Spark config are actually available.
Search your codebase for stop() calls and make sure none of them stops the SparkContext before you are done with it (see the sketch after this list).
Spark has a web UI where you can see which jobs ran, whether they failed or succeeded, along with their logs. That will tell you why it is failing.
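To make the stop() point concrete, here is a minimal, hedged sketch (the object name and configuration are illustrative, not taken from the question) of stopping a StreamingContext without stopping the SparkContext it wraps:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StopStreamingOnlySketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("stop-streaming-only")
    val ssc = new StreamingContext(conf, Seconds(1))

    // ... set up and run streams here ...

    // Stop streaming only; the underlying SparkContext stays alive.
    ssc.stop(stopSparkContext = false)

    // The SparkContext can still be used afterwards.
    val rdd = ssc.sparkContext.parallelize(Seq(1, 2, 3))
    println(rdd.count())

    // Stop the SparkContext only when everything is done.
    ssc.sparkContext.stop()
  }
}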

"Cannot call methods on a stopped SparkContext" is a consequence of some error that happened earlier. Look at the logs in $SPARK_HOME/logs and $SPARK_HOME/work.

Restarting the Spark context from the interpreter binding panel worked for me: you just have to click the refresh button and save.

The issue is that only one SparkSession or SparkContext is allowed per JVM. Make sure the instance being used is a singleton, for instance by wrapping the SparkSession (or SparkContext) in a SharedSparkSession (or SharedSparkContext) object.
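A minimal sketch of that idea for a SparkContext (the configuration values are illustrative; SharedSparkContext here is a test helper you define yourself, not a Spark API):
import org.apache.spark.{SparkConf, SparkContext}

object SharedSparkContext {
  // Created lazily exactly once per JVM and reused by every test suite.
  lazy val sc: SparkContext = new SparkContext(
    new SparkConf().setMaster("local[2]").setAppName("shared-test-context"))
}
Test suites then refer to SharedSparkContext.sc instead of creating, and more importantly stopping, their own context.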

Related

Dataset data is updated after inserting into Mysql Database

I have a small scenario where I read a text file, calculate an average based on date, and store the summary in a MySQL database.
Following is the code:
val repo_sum = joined_data.map(SensorReport.generateReport)
repo_sum.show()  // STEP 1
repo_sum.write.mode(SaveMode.Overwrite).jdbc(url, "sensor_report", prop)
repo_sum.show()  // STEP 2
After calculating the average in the repo_sum dataframe, the following is the result at STEP 1:
+----------+------------------+-----+-----+
| date| flo| hz|count|
+----------+------------------+-----+-----+
|2017-10-05|52.887049194476745|10.27| 5.0|
|2017-10-04| 55.4188048943416|10.27| 5.0|
|2017-10-03| 54.1529270444092|10.27| 10.0|
+----------+------------------+-----+-----+
Then the save command is executed, and the dataset values at STEP 2 are:
+----------+-----------------+------------------+-----+
| date| flo| hz|count|
+----------+-----------------+------------------+-----+
|2017-10-05|52.88704919447673|31.578524597238367| 10.0|
|2017-10-04| 55.4188048943416| 32.84440244717079| 10.0|
+----------+-----------------+------------------+-----+
Following is the complete code:
class StreamRead extends Serializable {
org.apache.spark.sql.catalyst.encoders.OuterScopes.addOuterScope(this);
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Application").setMaster("local[2]")
val ssc = new StreamingContext(conf, Seconds(2))
val sqlContext = new SQLContext(ssc.sparkContext)
import sqlContext.implicits._
val sensorDStream = ssc.textFileStream("file:///C:/Users/M1026352/Desktop/Spark/StreamData").map(Sensor.parseSensor)
val url = "jdbc:mysql://localhost:3306/streamdata"
val prop = new java.util.Properties
prop.setProperty("user", "root")
prop.setProperty("password", "root")
val tweets = sensorDStream.foreachRDD {
rdd =>
if (rdd.count() != 0) {
val databaseVal = sqlContext.read.jdbc("jdbc:mysql://localhost:3306/streamdata", "sensor_report", prop)
val rdd_group = rdd.groupBy { x => x.date }
val repo_data = rdd_group.map { x =>
val sum_flo = x._2.map { x => x.flo }.reduce(_ + _)
val sum_hz = x._2.map { x => x.hz }.reduce(_ + _)
val sum_flo_count = x._2.size
print(sum_flo_count)
SensorReport(x._1, sum_flo, sum_hz, sum_flo_count)
}
val df = repo_data.toDF()
val joined_data = df.join(databaseVal, Seq("date"), "fullouter")
joined_data.show()
val repo_sum = joined_data.map(SensorReport.generateReport)
repo_sum.show()
repo_sum.write.mode(SaveMode.Overwrite).jdbc(url, "sensor_report", prop)
repo_sum.show()
}
}
ssc.start()
WorkerAndTaskExample.main(args)
ssc.awaitTermination()
}
case class Sensor(resid: String, date: String, time: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double)
object Sensor extends Serializable {
def parseSensor(str: String): Sensor = {
val p = str.split(",")
Sensor(p(0), p(1), p(2), p(3).toDouble, p(4).toDouble, p(5).toDouble, p(6).toDouble, p(7).toDouble, p(8).toDouble)
}
}
case class SensorReport(date: String, flo: Double, hz: Double, count: Double)
object SensorReport extends Serializable {
def generateReport(row: Row): SensorReport = {
print(row)
if (row.get(4) == null) {
SensorReport(row.getString(0), row.getDouble(1) / row.getDouble(3), row.getDouble(2) / row.getDouble(3), row.getDouble(3))
} else if (row.get(2) == null) {
SensorReport(row.getString(0), row.getDouble(4), row.getDouble(5), row.getDouble(6))
} else {
val count = row.getDouble(3) + row.getDouble(6)
val flow_avg_update = (row.getDouble(6) * row.getDouble(4) + row.getDouble(1)) / count
val flow_flo_update = (row.getDouble(6) * row.getDouble(5) + row.getDouble(1)) / count
print(count + " : " + flow_avg_update + " : " + flow_flo_update)
SensorReport(row.getString(0), flow_avg_update, flow_flo_update, count)
}
}
}
As far as I understand, when the save command is executed in Spark, the whole process runs again. Is my understanding correct? Please let me know.
In Spark all transformations are lazy; nothing happens until an action is called. At the same time, this means that if multiple actions are called on the same RDD or dataframe, all computations will be performed multiple times. This includes loading the data and all transformations.
To avoid this, use cache() or persist() (they do the same thing, except that persist() lets you specify a storage level, while cache() uses the default, which for RDDs is memory only). cache() keeps the RDD/dataframe in memory after the first action on it, and hence avoids running the same transformations multiple times.
In this case, performing two actions on the dataframe is what causes this unexpected behavior, so caching the dataframe solves the problem:
val repo_sum = joined_data.map(SensorReport.generateReport).cache()
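If memory is a concern, a hedged variant of the same fix, reusing the question's joined_data, url, and prop, is to persist with an explicit storage level (the level chosen here is just an example):
import org.apache.spark.storage.StorageLevel

val repo_sum = joined_data.map(SensorReport.generateReport)
  .persist(StorageLevel.MEMORY_AND_DISK)  // cache() uses the default level; persist() lets you choose one

repo_sum.show()                                                           // first action computes and caches the result
repo_sum.write.mode(SaveMode.Overwrite).jdbc(url, "sensor_report", prop)
repo_sum.show()                                                           // reuses the cached result instead of recomputing
repo_sum.unpersist()                                                      // release the cached data when done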

Empty Iterator : Asynchronous cassandra write

I am trying to implement asynchronous Cassandra writes on objects (not RDDs) using TableWriter. Code snippet below:
class CassandraOperations[T] extends Serializable with Logging {
/**
* Saves the data from an object or an Iterator of objects to a Cassandra table asynchronously, using the specified column names.
* You can check whether this action has completed via a callback on the returned Future.
*/
def saveToCassandraAsync(
cc: CassandraConnector,
keyspaceName: String,
tableName: String,
columns: ColumnSelector = AllColumns,
data: Iterator[T],
writeConf: WriteConf = WriteConf(ttl = TTLOption.constant(80000)))(implicit rwf: RowWriterFactory[T]):
Future[Unit] = {
implicit val ec = ExecutionContext.global
val writer = TableWriter(cc, keyspaceName, tableName, columns, writeConf)
val futureAction = Future(writer.write(TaskContext.get(), data: Iterator[T]))
futureAction
}
}
And then wait using:
Await.result(resultFuture, TIMEOUT seconds)
The data is available when the execution reaches the write call on this line:
val futureAction = Future(writer.write(TaskContext.get(), data: Iterator[T]))
But data is empty by the time execution reaches the definition of the write function, def write(taskContext: TaskContext, data: Iterator[T]):
def write(taskContext: TaskContext, data: Iterator[T]) {
val updater = OutputMetricsUpdater(taskContext, writeConf)
connector.withSessionDo { session =>
val protocolVersion = session.getCluster.getConfiguration.getProtocolOptions.getProtocolVersion
val rowIterator = new CountingIterator(data)
val stmt = prepareStatement(session).setConsistencyLevel(writeConf.consistencyLevel)
val queryExecutor = new QueryExecutor(
session,
writeConf.parallelismLevel,
Some(updater.batchFinished(success = true, _, _, _)),
Some(updater.batchFinished(success = false, _, _, _)))
val routingKeyGenerator = new RoutingKeyGenerator(tableDef, columnNames)
val batchType = if (isCounterUpdate) Type.COUNTER else Type.UNLOGGED
val boundStmtBuilder = new BoundStatementBuilder(
rowWriter,
stmt,
protocolVersion = protocolVersion,
ignoreNulls = writeConf.ignoreNulls)
val batchStmtBuilder = new BatchStatementBuilder(
batchType,
routingKeyGenerator,
writeConf.consistencyLevel)
val batchKeyGenerator = batchRoutingKey(session, routingKeyGenerator) _
val batchBuilder = new GroupingBatchBuilder(
boundStmtBuilder,
batchStmtBuilder,
batchKeyGenerator,
writeConf.batchSize,
writeConf.batchGroupingBufferSize,
rowIterator)
val rateLimiter = new RateLimiter((writeConf.throughputMiBPS * 1024 * 1024).toLong, 1024 * 1024)
logDebug(s"Writing data partition to $keyspaceName.$tableName in batches of ${writeConf.batchSize}.")
for (stmtToWrite <- batchBuilder) {
queryExecutor.executeAsync(stmtToWrite)
assert(stmtToWrite.bytesCount > 0)
rateLimiter.maybeSleep(stmtToWrite.bytesCount)
}
queryExecutor.waitForCurrentlyExecutingTasks()
if (!queryExecutor.successful)
throw new IOException(s"Failed to write statements to $keyspaceName.$tableName.")
val duration = updater.finish() / 1000000000d
logInfo(f"Wrote ${rowIterator.count} rows to $keyspaceName.$tableName in $duration%.3f s.")
if (boundStmtBuilder.logUnsetToNullWarning) {
logWarning(boundStmtBuilder.UnsetToNullWarning)
}
}
}
}
So I see an empty iterator.
Please advise on what the issue could be.
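For reference, here is a self-contained sketch of the two ways to observe completion that the saveToCassandraAsync doc comment mentions, a callback via onComplete or a blocking Await with an explicit timeout; the Future body here is a stand-in, not the Cassandra write itself:
import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.util.{Failure, Success}

object FutureCompletionSketch {
  def main(args: Array[String]): Unit = {
    // Stand-in for the Future returned by saveToCassandraAsync.
    val resultFuture: Future[Unit] = Future { Thread.sleep(200) }

    // Non-blocking: react when the write finishes.
    resultFuture.onComplete {
      case Success(_)  => println("write completed")
      case Failure(ex) => println(s"write failed: ${ex.getMessage}")
    }

    // Blocking: wait at most 10 seconds for the result.
    Await.result(resultFuture, 10.seconds)
  }
}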

Task not serializable Flink

I am trying to run the PageRank basic example in Flink with a small modification (only in reading the input file; everything else is the same). I am getting the error "Task not serializable", and below is part of the error output:
at org.apache.flink.api.scala.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:179)
at org.apache.flink.api.scala.ClosureCleaner$.clean(ClosureCleaner.scala:171)
Below is my code
object hpdb {
def main(args: Array[String]) {
val env = ExecutionEnvironment.getExecutionEnvironment
val maxIterations = 10000
val DAMPENING_FACTOR: Double = 0.85
val EPSILON: Double = 0.0001
val outpath = "/home/vinoth/bigdata/assign10/pagerank.csv"
val links = env.readCsvFile[Tuple2[Long,Long]]("/home/vinoth/bigdata/assign10/ppi.csv",
fieldDelimiter = "\t", includedFields = Array(1,4)).as('sourceId,'targetId).toDataSet[Link]//source and target
val pages = env.readCsvFile[Tuple1[Long]]("/home/vinoth/bigdata/assign10/ppi.csv",
fieldDelimiter = "\t", includedFields = Array(1)).as('pageId).toDataSet[Id]//Pageid
val noOfPages = pages.count()
val pagesWithRanks = pages.map(p => Page(p.pageId, 1.0 / noOfPages))
val adjacencyLists = links
// initialize lists ._1 is the source id and ._2 is the traget id
.map(e => AdjacencyList(e.sourceId, Array(e.targetId)))
// concatenate lists
.groupBy("sourceId").reduce {
(l1, l2) => AdjacencyList(l1.sourceId, l1.targetIds ++ l2.targetIds)
}
// start iteration
val finalRanks = pagesWithRanks.iterateWithTermination(maxIterations) {
// the output shows the error here
currentRanks =>
val newRanks = currentRanks
// distribute ranks to target pages
.join(adjacencyLists).where("pageId").equalTo("sourceId") {
(page, adjacent, out: Collector[Page]) =>
for (targetId <- adjacent.targetIds) {
out.collect(Page(targetId, page.rank / adjacent.targetIds.length))
}
}
// collect ranks and sum them up
.groupBy("pageId").aggregate(SUM, "rank")
// apply dampening factor
// the output shows the error here
.map { p =>
Page(p.pageId, (p.rank * DAMPENING_FACTOR) + ((1 - DAMPENING_FACTOR) / pages.count()))
}
// terminate if no rank update was significant
val termination = currentRanks.join(newRanks).where("pageId").equalTo("pageId") {
(current, next, out: Collector[Int]) =>
// check for significant update
if (math.abs(current.rank - next.rank) > EPSILON) out.collect(1)
}
(newRanks, termination)
}
val result = finalRanks
// emit result
result.writeAsCsv(outpath, "\n", " ")
env.execute()
}
}
Any help in the right direction is highly appreciated. Thank you.
The problem is that you reference the DataSet pages from within a MapFunction. This is not possible, since a DataSet is only the logical representation of a data flow and cannot be accessed at runtime.
To solve this problem, assign the result of pages.count to a variable, e.g. val pagesCount = pages.count, before the iteration, and refer to that variable in your MapFunction.
What pages.count actually does is trigger the execution of the data flow graph so that the number of elements in pages can be counted. The result is then returned to your program.
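A minimal, self-contained sketch of that idea (the input data here is made up; in your program, reuse the noOfPages value you already compute instead of calling pages.count() inside the dampening map):
import org.apache.flink.api.scala._

object CountOutsideMapSketch {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    val pages: DataSet[Long] = env.fromElements(1L, 2L, 3L, 4L)

    // count() triggers execution of the data flow and returns a plain Long to the driver program.
    val noOfPages: Long = pages.count()

    // Closing over the Long is fine; closing over the DataSet `pages` itself is not.
    val ranks = pages.map(id => (id, 1.0 / noOfPages))

    ranks.print()
  }
}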

why spark sort is slower than scala original sort method

I have heard many people say that Spark is good at sorting and distributed computing. Currently, our team is doing some research on Spark and Scala, and we are going to implement a sorting service on Spark. I have set up the Spark cluster and tried to run a sorting example on it, but the sorting time seems too long. Here is my code.
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer
import scala.util.Random
/**
* Created by on 1/1/15.
*/
object AdvancedSort {
/**
* bin/spark-submit --master spark://master:7077 --executor-memory 1024M --class com.my.sortedspark.AdvancedSort lib/sortedspark.jar 100000 3
* @param args
*/
def main(args: Array[String]) {
val sampleSize = if (args.length > 0) args(0).toInt else 100000
val slice = if (args.length > 1) args(1).toInt else 3
sort(sampleSize, slice)
}
def sort(listSize: Int, slice: Int): Unit = {
val conf = new SparkConf().setAppName(getClass.getName)
val spark = new SparkContext(conf)
val step1 = System.currentTimeMillis()
val data = genRandom(listSize)
val step2 = System.currentTimeMillis()
println(">>>>>>>>>> genRandom : " + (step2 - step1))
val distData = spark.parallelize(data, slice)
val step3 = System.currentTimeMillis()
println(">>>>>>>>>> parallelize : " + (step3 - step2))
val result = distData.sortBy(x => x, true).collect
val step4 = System.currentTimeMillis()
println(">>>>>>>>>> sortBy and collect: " + (step4 - step3))
println(">>>>>>>>>> total time : " + (step4 - step1))
printlnArray(result, 0, 10)
spark.stop()
}
/**
* generate random number
* @return
*/
def genRandom(listSize: Int): List[Int] = {
val range = 100000
var listBuffer = new ListBuffer[Int]
val random = new Random()
for (i <- 1 to listSize) listBuffer += random.nextInt(range)
listBuffer.toList
}
def printlnList(list: List[Int], start: Int, offset: Int) {
for (i <- start until start + offset) println(">>>>>>>>> list : " + i + " | " + list(i))
}
def printlnArray(list: Array[Int], start: Int, offset: Int) {
for (i <- start until start + offset) println(">>>>>>>>> list : " + i + " | " + list(i))
}
}
After deploying the above code to the Spark cluster, I ran the following command from the master's Spark home:
bin/spark-submit --master spark://master:7077 --executor-memory 1024M --class com.my.sortedspark.AdvancedSort lib/sortedspark.jar 100000 3
The following are the times I measured:
>>>>>>>>>> genRandom : 86
>>>>>>>>>> parallelize : 53
>>>>>>>>>> sortBy and collect: 6756
This looks strange, because if I sort 100000 random Ints via Scala's sorted method on my local machine, it is quicker than Spark.
import scala.collection.mutable.ListBuffer
import scala.util.Random
/**
* Created by on 1/5/15.
*/
object ScalaSort {
def main(args: Array[String]) {
val list = genRandom(1000000)
val start = System.currentTimeMillis()
val result = list.sorted
val end = System.currentTimeMillis()
println(">>>>>>>>>>>>>>>>>> cost time : " + (end - start))
}
/**
* generate random number
* @return
*/
def genRandom(listSize: Int): List[Int] = {
val range = 100000
var listBuffer = new ListBuffer[Int]
val random = new Random()
for (i <- 1 to listSize) listBuffer += random.nextInt(range)
listBuffer.toList
}
}
Time taken by Scala's sorted method on my local machine:
>>>>>>>>>>>>>>>>>> cost time : 169
In my opinion, the following factors account for Spark's sorting time:
data transfer between the Master and the Workers
sorting on each Worker is quick, but the merge may be slow
Does anyone who knows Spark well know why this happens?
Spark is made for big data.
When you feed it tiny amounts of data, it acts slower, because distributing the work over all cores/the cluster takes more time than sorting it locally would.
Try using bigger data, or instead of Spark use parallel collections in Scala, as sketched after the snippet below:
collection.par.<any code here>
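A hedged sketch of that suggestion (the workload is illustrative; note that in Scala 2.12 and earlier .par is in the standard library, while in 2.13 it moved to the separate scala-parallel-collections module):
import scala.util.Random

object ParCollectionSketch {
  def main(args: Array[String]): Unit = {
    // Same scale as the question: 100000 random Ints, generated locally.
    val data = List.fill(100000)(Random.nextInt(100000))

    // .par spreads CPU-bound work over local cores, with no cluster or serialization overhead.
    val sumOfSquares = data.par.map(x => x.toLong * x).sum

    // A plain sequential sort at this size is already fast.
    val sorted = data.sorted

    println(s"sumOfSquares = $sumOfSquares, min = ${sorted.head}, max = ${sorted.last}")
  }
}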

scala priority queue not ordering properly?

I'm seeing some strange behavior with Scala's collection.mutable.PriorityQueue. I'm performing an external sort and testing it with 1M records. Each time I run the test and verify the results, between 10 and 20 records are not sorted properly. If I replace the Scala PriorityQueue implementation with a java.util.PriorityQueue, it works 100% of the time. Any ideas?
Here's the code (sorry it's a bit long...). I test it using the tools gensort -a 1000000 and valsort from http://sortbenchmark.org/
def externalSort(inFileName: String, outFileName: String)
(implicit ord: Ordering[String]): Int = {
val MaxTempFiles = 1024
val TempBufferSize = 4096
val inFile = new java.io.File(inFileName)
/** Partitions input file and sorts each partition */
def partitionAndSort()(implicit ord: Ordering[String]):
List[java.io.File] = {
/** Gets block size to use */
def getBlockSize: Long = {
var blockSize = inFile.length / MaxTempFiles
val freeMem = Runtime.getRuntime().freeMemory()
if (blockSize < freeMem / 2)
blockSize = freeMem / 2
else if (blockSize >= freeMem)
System.err.println("Not enough free memory to use external sort.")
blockSize
}
/** Sorts and writes data to temp files */
def writeSorted(buf: List[String]): java.io.File = {
// Create new temp buffer
val tmp = java.io.File.createTempFile("external", "sort")
tmp.deleteOnExit()
// Sort buffer and write it out to tmp file
val out = new java.io.PrintWriter(tmp)
try {
for (l <- buf.sorted) {
out.println(l)
}
} finally {
out.close()
}
tmp
}
val blockSize = getBlockSize
var tmpFiles = List[java.io.File]()
var buf = List[String]()
var currentSize = 0
// Read input and divide into blocks
for (line <- io.Source.fromFile(inFile).getLines()) {
if (currentSize > blockSize) {
tmpFiles ::= writeSorted(buf)
buf = List[String]()
currentSize = 0
}
buf ::= line
currentSize += line.length() * 2 // 2 bytes per char
}
if (currentSize > 0) tmpFiles ::= writeSorted(buf)
tmpFiles
}
/** Merges results of sorted partitions into one output file */
def mergeSortedFiles(fs: List[java.io.File])
(implicit ord: Ordering[String]): Int = {
/** Temp file buffer for reading lines */
class TempFileBuffer(val file: java.io.File) {
private val in = new java.io.BufferedReader(
new java.io.FileReader(file), TempBufferSize)
private var curLine: String = ""
readNextLine() // prep first value
def currentLine = curLine
def isEmpty = curLine == null
def readNextLine() {
if (curLine == null) return
try {
curLine = in.readLine()
} catch {
case _: java.io.EOFException => curLine = null
}
if (curLine == null) in.close()
}
override protected def finalize() {
try {
in.close()
} finally {
super.finalize()
}
}
}
val wrappedOrd = new Ordering[TempFileBuffer] {
def compare(o1: TempFileBuffer, o2: TempFileBuffer): Int = {
ord.compare(o1.currentLine, o2.currentLine)
}
}
val pq = new collection.mutable.PriorityQueue[TempFileBuffer](
)(wrappedOrd)
// Init queue with item from each file
for (tmp <- fs) {
val buf = new TempFileBuffer(tmp)
if (!buf.isEmpty) pq += buf
}
var count = 0
val out = new java.io.PrintWriter(new java.io.File(outFileName))
try {
// Read each value off of queue
while (pq.size > 0) {
val buf = pq.dequeue()
out.println(buf.currentLine)
count += 1
buf.readNextLine()
if (buf.isEmpty) {
buf.file.delete() // don't need anymore
} else {
// re-add to priority queue so we can process next line
pq += buf
}
}
} finally {
out.close()
}
count
}
mergeSortedFiles(partitionAndSort())
}
My tests don't show any bugs in PriorityQueue.
import scala.collection.mutable.PriorityQueue
import org.scalacheck._
import Prop._
object PriorityQueueProperties extends Properties("PriorityQueue") {
def listToPQ(l: List[String]): PriorityQueue[String] = {
val pq = new PriorityQueue[String]
l foreach (pq +=)
pq
}
def pqToList(pq: PriorityQueue[String]): List[String] =
if (pq.isEmpty) Nil
else { val h = pq.dequeue; h :: pqToList(pq) }
property("Enqueued elements are dequeued in reverse order") =
forAll { (l: List[String]) => l.sorted == pqToList(listToPQ(l)).reverse }
property("Adding/removing elements doesn't break sorting") =
forAll { (l: List[String], s: String) =>
(l.size > 0) ==>
((s :: l.sorted.init).sorted == {
val pq = listToPQ(l)
pq.dequeue
pq += s
pqToList(pq).reverse
})
}
}
scala> PriorityQueueProperties.check
+ PriorityQueue.Enqueued elements are dequeued in reverse order: OK, passed
100 tests.
+ PriorityQueue.Adding/removing elements doesn't break sorting: OK, passed
100 tests.
If you could somehow reduce the input enough to make a test case, it would help.
I ran it with five million inputs several times, and the output always matched the expected result. My guess from looking at your code is that your Ordering is the problem (i.e. it is giving inconsistent answers).
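To illustrate what an inconsistent Ordering can do, here is a minimal, hypothetical sketch (the Box class is made up, it is not your TempFileBuffer): once an element's comparison key changes while it sits inside the queue, the heap invariant is silently broken and the dequeue order is no longer guaranteed.
import scala.collection.mutable.PriorityQueue

object MutableOrderingSketch {
  // Hypothetical element whose comparison key can change after insertion.
  class Box(var value: Int)

  def main(args: Array[String]): Unit = {
    // Order boxes so that the smallest current value is dequeued first.
    val ord = Ordering.by[Box, Int](b => -b.value)
    val pq = new PriorityQueue[Box]()(ord)

    val boxes = Seq(new Box(5), new Box(1), new Box(9))
    pq ++= boxes

    // Mutating an element that is already in the queue breaks the heap invariant.
    boxes.head.value = 0

    // The values may no longer come out in ascending order.
    while (pq.nonEmpty) println(pq.dequeue().value)
  }
}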