I am using Spark 1.0.0 on a 3-node cluster with 1 master and 2 slaves. I am trying to run the LR (logistic regression) algorithm over Spark Streaming.
package org.apache.spark.examples.streaming;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
/**
* Logistic regression based classification using ML Lib.
*/
public final class StreamingJavaLR {
static int i = 1;
// static LogisticRegressionModel model;
// private static final Pattern SPACE = Pattern.compile(" ");
static class ParsePoint implements Function<String, LabeledPoint> {
private static final Pattern COMMA = Pattern.compile(",");
private static final Pattern SPACE = Pattern.compile(" ");
@Override
public LabeledPoint call(String line) {
String[] parts = COMMA.split(line);
double y = Double.parseDouble(parts[0]);
String[] tok = SPACE.split(parts[1]);
double[] x = new double[tok.length];
for (int i = 0; i < tok.length; ++i) {
x[i] = Double.parseDouble(tok[i]);
}
return new LabeledPoint(y, Vectors.dense(x));
}
}
// Edited
static class ParsePointforInput implements Function<String, double[]> {
private static final Pattern SPACE = Pattern.compile(" ");
@Override
public double[] call(String line) {
String[] tok = SPACE.split(line);
double[] x = new double[tok.length];
for (int i = 0; i < tok.length; ++i) {
x[i] = Double.parseDouble(tok[i]);
}
return x;
}
}
public static void main(String[] args) {
if (args.length != 5) {
System.err
.println("Usage: JavaLR <master> <input_file_for_training> <step_size> <no_iters> <input_file_for_prediction>");
System.exit(1);
}
FileWriter file;
PrintWriter outputFile = null;
SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
Calendar cal=Calendar.getInstance();
final Date startTime;
System.out.println("<<<<<Let's Print>>>>>");
// SparkConf conf = new SparkConf()
// .setMaster(args[0])
// .setAppName("StreamingJavaLR")
// .set("spark.cleaner.ttl", "1000")
// .set("spark.executor.uri", "hdfs://192.168.145.191:9000/user/praveshj/spark/spark-0.9.1.tar.gz")
// .setJars(JavaSparkContext.jarOfClass(StreamingJavaLR.class));
//
// JavaSparkContext sc = new JavaSparkContext(conf);
JavaSparkContext sc = new JavaSparkContext(args[0],
"StreamingJavaLR",
System.getenv("SPARK_HOME"),
JavaSparkContext.jarOfClass(StreamingJavaLR.class));
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Reading File");
JavaRDD<String> lines = sc.textFile(args[1]);
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>File has been Read now mapping");
JavaRDD<LabeledPoint> points = lines.map(new ParsePoint()).cache();
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Mapping Done");
double stepSize = Double.parseDouble(args[2]);
int iterations = Integer.parseInt(args[3]);
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Read the arguments. stepSize = "+stepSize+" and iterations = "+iterations);
BufferedReader br = null;
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Training the Model");
final LogisticRegressionModel model = LogisticRegressionWithSGD.train(
points.rdd(), iterations, stepSize);
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Model Trained");
System.out.println("Final w: " + model.weights());
// printWeights(model.weights());
System.out.println("Intercept : " + model.intercept());
final Vector weightVector = model.weights();
// double[] weightArray = model.weights();
//
// final DoubleMatrix weightMatrix = new DoubleMatrix(weightArray);
sc.stop();
try {
Thread.sleep(1000);
} catch (InterruptedException ex) {
Thread.currentThread().interrupt();
}
// try {
// file = new FileWriter(args[5]);
// outputFile = new PrintWriter(file);
// cal = Calendar.getInstance();
// cal.getTime();
//// startTime = sdf.format(cal.getTime());
// startTime = cal.getTime();
// outputFile.println("Start Time : " + startTime);
// outputFile.flush();
// } catch (IOException E) {
// E.printStackTrace();
// }
// final JavaStreamingContext ssc = new JavaStreamingContext(sc,
// new Duration(1000));
startTime = cal.getTime();
final JavaStreamingContext ssc = new JavaStreamingContext(args[0],
"StreamingJavaLR", new Duration(1000),
System.getenv("SPARK_HOME"),
JavaStreamingContext.jarOfClass(StreamingJavaLR.class));
JavaDStream<String> lines_2 = ssc.textFileStream(args[4]);
JavaDStream<double[]> points_2 = lines_2.map(new ParsePointforInput());
// points_2.print();
// System.out.print(lines_2.count());
// System.exit(0);
points_2.foreachRDD(new Function<JavaRDD<double[]>, Void>() {
@Override
public Void call(JavaRDD<double[]> rdd) {
List<double[]> temp = rdd.collect();
//If no more data is left for Prediction, Stop the Program
// if (rdd.count() == 0)
// ssc.stop();
FileWriter newfile = null;
BufferedWriter bw = null;
try {
newfile = new FileWriter(
"/home/pravesh/data/abc"
+ i++ + ".txt");
bw = new BufferedWriter(newfile);
} catch (IOException e) {
e.printStackTrace();
}
int inpNo = 0;
double result;
for (double[] dArray : temp) {
double[][] dataArray = new double[1][2];
for (int i = 0; i < dArray.length; i++)
dataArray[0][i] = dArray[i];
// DoubleMatrix dataMatrix = new DoubleMatrix(dataArray);
// result = model.predictPoint(dataMatrix, weightMatrix,
// model.intercept());
Vector dataVector = Vectors.dense(dArray);
result = model.predictPoint(dataVector, weightVector, model.intercept());
try {
Calendar cal2 = Calendar.getInstance();
// bw.write("INFO at " + cal2.getTime() + " : " + "Point " + inpNo + " (" + dataMatrix.get(0, 0)
// + ", " + dataMatrix.get(0, 1) + ")"
// + " belongs to : " + result + " and Start Time was " + startTime + "\n");
bw.write("INFO at " + cal2.getTime() + " : " + "Point " + inpNo + " (" + dataVector.toArray()[0]
+ ", " + dataVector.toArray()[1] + ")"
+ " belongs to : " + result + " and Start Time was " + startTime + "\n");
bw.flush();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// newoutputFile.flush();
inpNo++;
}
try {
bw.close();
newfile.close();
} catch (IOException e) {
e.printStackTrace();
}
Void v = null;
return v;
}
});
ssc.start();
ssc.awaitTermination();
// cal = Calendar.getInstance();
// outputFile.println(" End Time : " + cal.getTime());
// outputFile.flush();
System.exit(0);
}
}
As you can see, I read the training input from a file with the JavaSparkContext and the testing input with the JavaStreamingContext.
I have used the data given in $SPARK_HOME/mllib/data/lr-data/random.data for both training and testing; to obtain larger data sets, I duplicated this data. The code works fine for every data set in local mode. Over the cluster, however, it is not able to process the file containing 0.4 million entries.
For every other data set (a file with 0.8 million entries here), the output looks like this (output after the StreamingContext is started):
14/06/06 11:36:09 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606113609-0001/0 on hostPort host-DSRV05.host.co.in:55206 with 8 cores, 512.0 MB RAM
14/06/06 11:36:09 INFO AppClient$ClientActor: Executor added: app-20140606113609-0001/1 on worker-20140606114445-host-DSRV04.host.co.in-39342 (host-DSRV04.host.co.in:39342) with 8 cores
14/06/06 11:36:09 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606113609-0001/1 on hostPort host-DSRV04.host.co.in:39342 with 8 cores, 512.0 MB RAM
14/06/06 11:36:09 INFO AppClient$ClientActor: Executor updated: app-20140606113609-0001/0 is now RUNNING
14/06/06 11:36:09 INFO AppClient$ClientActor: Executor updated: app-20140606113609-0001/1 is now RUNNING
14/06/06 11:36:09 INFO RecurringTimer: Started timer for JobGenerator at time 1402034770000
14/06/06 11:36:09 INFO JobGenerator: Started JobGenerator at 1402034770000 ms
14/06/06 11:36:09 INFO JobScheduler: Started JobScheduler
14/06/06 11:36:10 INFO FileInputDStream: Finding new files took 29 ms
14/06/06 11:36:10 INFO FileInputDStream: New files at time 1402034770000 ms:
file:/newdisk1/praveshj/pravesh/data/input/testing8lk.txt
14/06/06 11:36:10 INFO MemoryStore: ensureFreeSpace(33216) called with curMem=0, maxMem=309225062
14/06/06 11:36:10 INFO MemoryStore: Block broadcast_0 stored as values to memory (estimated size 32.4 KB, free 294.9 MB)
14/06/06 11:36:10 INFO FileInputFormat: Total input paths to process : 1
14/06/06 11:36:10 INFO JobScheduler: Added jobs for time 1402034770000 ms
14/06/06 11:36:10 INFO JobScheduler: Starting job streaming job 1402034770000 ms.0 from job set of time 1402034770000 ms
14/06/06 11:36:10 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:10 INFO DAGScheduler: Got job 0 (collect at StreamingJavaLR.java:170) with 1 output partitions (allowLocal=false)
14/06/06 11:36:10 INFO DAGScheduler: Final stage: Stage 0(collect at StreamingJavaLR.java:170)
14/06/06 11:36:10 INFO DAGScheduler: Parents of final stage: List()
14/06/06 11:36:10 INFO DAGScheduler: Missing parents: List()
14/06/06 11:36:10 INFO DAGScheduler: Submitting Stage 0 (MappedRDD[3] at map at MappedDStream.scala:35), which has no missing parents
14/06/06 11:36:10 INFO DAGScheduler: Submitting 1 missing tasks from Stage 0 (MappedRDD[3] at map at MappedDStream.scala:35)
14/06/06 11:36:10 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
14/06/06 11:36:10 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor#host-DSRV05.host.co.in:47657/user/Executor#-1277914179] with ID 0
14/06/06 11:36:10 INFO TaskSetManager: Starting task 0.0:0 as TID 0 on executor 0: host-DSRV05.host.co.in (PROCESS_LOCAL)
14/06/06 11:36:10 INFO TaskSetManager: Serialized task 0.0:0 as 3544 bytes in 1 ms
14/06/06 11:36:10 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor#host-DSRV04.host.co.in:46975/user/Executor#1659982546] with ID 1
14/06/06 11:36:10 INFO BlockManagerInfo: Registering block manager host-DSRV05.host.co.in:52786 with 294.9 MB RAM
14/06/06 11:36:10 INFO BlockManagerInfo: Registering block manager host-DSRV04.host.co.in:42008 with 294.9 MB RAM
14/06/06 11:36:11 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:36:11 INFO FileInputDStream: New files at time 1402034771000 ms:
14/06/06 11:36:11 INFO JobScheduler: Added jobs for time 1402034771000 ms
14/06/06 11:36:12 INFO FileInputDStream: Finding new files took 1 ms
14/06/06 11:36:12 INFO FileInputDStream: New files at time 1402034772000 ms:
14/06/06 11:36:12 INFO JobScheduler: Added jobs for time 1402034772000 ms
14/06/06 11:36:13 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:36:13 INFO FileInputDStream: New files at time 1402034773000 ms:
14/06/06 11:36:13 INFO JobScheduler: Added jobs for time 1402034773000 ms
14/06/06 11:36:14 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:36:14 INFO FileInputDStream: New files at time 1402034774000 ms:
14/06/06 11:36:14 INFO JobScheduler: Added jobs for time 1402034774000 ms
14/06/06 11:36:15 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:36:15 INFO FileInputDStream: New files at time 1402034775000 ms:
14/06/06 11:36:15 INFO JobScheduler: Added jobs for time 1402034775000 ms
14/06/06 11:36:15 INFO BlockManagerInfo: Added taskresult_0 in memory on host-DSRV05.host.co.in:52786 (size: 19.9 MB, free: 275.0 MB)
14/06/06 11:36:15 INFO SendingConnection: Initiating connection to [host-DSRV05.host.co.in/192.168.145.195:52786]
14/06/06 11:36:15 INFO SendingConnection: Connected to [host-DSRV05.host.co.in/192.168.145.195:52786], 1 messages pending
14/06/06 11:36:15 INFO ConnectionManager: Accepted connection from [host-DSRV05.host.co.in/192.168.145.195]
14/06/06 11:36:15 INFO BlockManagerInfo: Removed taskresult_0 on host-DSRV05.host.co.in:52786 in memory (size: 19.9 MB, free: 294.9 MB)
14/06/06 11:36:15 INFO DAGScheduler: Completed ResultTask(0, 0)
14/06/06 11:36:15 INFO TaskSetManager: Finished TID 0 in 4961 ms on host-DSRV05.host.co.in (progress: 1/1)
14/06/06 11:36:15 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
14/06/06 11:36:15 INFO DAGScheduler: Stage 0 (collect at StreamingJavaLR.java:170) finished in 5.533 s
14/06/06 11:36:15 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 5.548644244 s
14/06/06 11:36:16 INFO FileInputDStream: Finding new files took 1 ms
14/06/06 11:36:16 INFO FileInputDStream: New files at time 1402034776000 ms:
14/06/06 11:36:16 INFO JobScheduler: Added jobs for time 1402034776000 ms
14/06/06 11:36:17 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:36:17 INFO FileInputDStream: New files at time 1402034777000 ms:
14/06/06 11:36:17 INFO JobScheduler: Added jobs for time 1402034777000 ms
14/06/06 11:36:18 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:36:18 INFO FileInputDStream: New files at time 1402034778000 ms:
14/06/06 11:36:18 INFO JobScheduler: Added jobs for time 1402034778000 ms
14/06/06 11:36:19 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:36:19 INFO FileInputDStream: New files at time 1402034779000 ms:
14/06/06 11:36:19 INFO JobScheduler: Added jobs for time 1402034779000 ms
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034770000 ms.0 from job set of time 1402034770000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 9.331 s for time 1402034770000 ms (execution: 9.274 s)
14/06/06 11:36:19 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:19 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 2.7293E-5 s
14/06/06 11:36:19 INFO JobScheduler: Starting job streaming job 1402034771000 ms.0 from job set of time 1402034771000 ms
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034771000 ms.0 from job set of time 1402034771000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 8.333 s for time 1402034771000 ms (execution: 0.000 s)
14/06/06 11:36:19 INFO JobScheduler: Starting job streaming job 1402034772000 ms.0 from job set of time 1402034772000 ms
14/06/06 11:36:19 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:19 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 1.4859E-5 s
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034772000 ms.0 from job set of time 1402034772000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 7.335 s for time 1402034772000 ms (execution: 0.002 s)
14/06/06 11:36:19 INFO JobScheduler: Starting job streaming job 1402034773000 ms.0 from job set of time 1402034773000 ms
14/06/06 11:36:19 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:19 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 1.5294E-5 s
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034773000 ms.0 from job set of time 1402034773000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 6.336 s for time 1402034773000 ms (execution: 0.001 s)
14/06/06 11:36:19 INFO JobScheduler: Starting job streaming job 1402034774000 ms.0 from job set of time 1402034774000 ms
14/06/06 11:36:19 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:19 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 1.117E-5 s
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034774000 ms.0 from job set of time 1402034774000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 5.337 s for time 1402034774000 ms (execution: 0.001 s)
14/06/06 11:36:19 INFO JobScheduler: Starting job streaming job 1402034775000 ms.0 from job set of time 1402034775000 ms
14/06/06 11:36:19 INFO FileInputDStream: Cleared 0 old files that were older than 1402034769000 ms:
14/06/06 11:36:19 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:19 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 1.1414E-5 s
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034775000 ms.0 from job set of time 1402034775000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 4.338 s for time 1402034775000 ms (execution: 0.001 s)
14/06/06 11:36:19 INFO JobScheduler: Starting job streaming job 1402034776000 ms.0 from job set of time 1402034776000 ms
14/06/06 11:36:19 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:19 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 4.2422E-5 s
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034776000 ms.0 from job set of time 1402034776000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 3.338 s for time 1402034776000 ms (execution: 0.000 s)
14/06/06 11:36:19 INFO JobScheduler: Starting job streaming job 1402034777000 ms.0 from job set of time 1402034777000 ms
14/06/06 11:36:19 INFO MappedRDD: Removing RDD 3 from persistence list
14/06/06 11:36:19 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:19 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 1.1133E-5 s
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034777000 ms.0 from job set of time 1402034777000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 2.339 s for time 1402034777000 ms (execution: 0.000 s)
14/06/06 11:36:19 INFO JobScheduler: Starting job streaming job 1402034778000 ms.0 from job set of time 1402034778000 ms
14/06/06 11:36:19 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:19 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 1.124E-5 s
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034778000 ms.0 from job set of time 1402034778000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 1.340 s for time 1402034778000 ms (execution: 0.001 s)
14/06/06 11:36:19 INFO JobScheduler: Starting job streaming job 1402034779000 ms.0 from job set of time 1402034779000 ms
14/06/06 11:36:19 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:36:19 INFO SparkContext: Job finished: collect at StreamingJavaLR.java:170, took 1.2101E-5 s
14/06/06 11:36:19 INFO JobScheduler: Finished job streaming job 1402034779000 ms.0 from job set of time 1402034779000 ms
14/06/06 11:36:19 INFO JobScheduler: Total delay: 0.341 s for time 1402034779000 ms (execution: 0.001 s)
14/06/06 11:36:19 INFO BlockManager: Removing RDD 3
14/06/06 11:36:19 INFO MappedRDD: Removing RDD 2 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 2
14/06/06 11:36:19 INFO UnionRDD: Removing RDD 1 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 1
14/06/06 11:36:19 INFO FileInputDStream: Cleared 0 old files that were older than 1402034770000 ms:
14/06/06 11:36:19 INFO MappedRDD: Removing RDD 6 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 6
14/06/06 11:36:19 INFO MappedRDD: Removing RDD 5 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 5
14/06/06 11:36:19 INFO UnionRDD: Removing RDD 4 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 4
14/06/06 11:36:19 INFO FileInputDStream: Cleared 1 old files that were older than 1402034771000 ms: 1402034770000 ms
14/06/06 11:36:19 INFO MappedRDD: Removing RDD 9 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 9
14/06/06 11:36:19 INFO MappedRDD: Removing RDD 8 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 8
14/06/06 11:36:19 INFO UnionRDD: Removing RDD 7 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 7
14/06/06 11:36:19 INFO FileInputDStream: Cleared 1 old files that were older than 1402034772000 ms: 1402034771000 ms
14/06/06 11:36:19 INFO MappedRDD: Removing RDD 12 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 12
14/06/06 11:36:19 INFO MappedRDD: Removing RDD 11 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 11
14/06/06 11:36:19 INFO UnionRDD: Removing RDD 10 from persistence list
14/06/06 11:36:19 INFO BlockManager: Removing RDD 10
14/06/06 11:36:19 INFO FileInputDStream: Cleared 1 old files that were older than 1402034773000 ms: 1402034772000 ms
14/06/06 11:36:20 INFO JobScheduler: Finished job streaming job 1402034780000 ms.0 from job set of time 1402034780000 ms
For the file with 0.4 million entries, the output is (output after the StreamingContext is started):
14/06/06 11:38:55 INFO AppClient$ClientActor: Executor added: app-20140606113855-0003/0 on worker-20140606114445-host-DSRV05.host.co.in-55206 (host-DSRV05.host.co.in:55206) with 8 cores
14/06/06 11:38:55 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606113855-0003/0 on hostPort host-DSRV05.host.co.in:55206 with 8 cores, 512.0 MB RAM
14/06/06 11:38:55 INFO AppClient$ClientActor: Executor added: app-20140606113855-0003/1 on worker-20140606114445-host-DSRV04.host.co.in-39342 (host-DSRV04.host.co.in:39342) with 8 cores
14/06/06 11:38:55 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606113855-0003/1 on hostPort host-DSRV04.host.co.in:39342 with 8 cores, 512.0 MB RAM
14/06/06 11:38:55 INFO AppClient$ClientActor: Executor updated: app-20140606113855-0003/0 is now RUNNING
14/06/06 11:38:55 INFO AppClient$ClientActor: Executor updated: app-20140606113855-0003/1 is now RUNNING
14/06/06 11:38:55 INFO RecurringTimer: Started timer for JobGenerator at time 1402034936000
14/06/06 11:38:55 INFO JobGenerator: Started JobGenerator at 1402034936000 ms
14/06/06 11:38:55 INFO JobScheduler: Started JobScheduler
14/06/06 11:38:56 INFO FileInputDStream: Finding new files took 31 ms
14/06/06 11:38:56 INFO FileInputDStream: New files at time 1402034936000 ms:
file:/newdisk1/praveshj/pravesh/data/input/testing4lk.txt
14/06/06 11:38:56 INFO MemoryStore: ensureFreeSpace(33216) called with curMem=0, maxMem=309225062
14/06/06 11:38:56 INFO MemoryStore: Block broadcast_0 stored as values to memory (estimated size 32.4 KB, free 294.9 MB)
14/06/06 11:38:56 INFO FileInputFormat: Total input paths to process : 1
14/06/06 11:38:56 INFO JobScheduler: Added jobs for time 1402034936000 ms
14/06/06 11:38:56 INFO JobScheduler: Starting job streaming job 1402034936000 ms.0 from job set of time 1402034936000 ms
14/06/06 11:38:56 INFO SparkContext: Starting job: collect at StreamingJavaLR.java:170
14/06/06 11:38:56 INFO DAGScheduler: Got job 0 (collect at StreamingJavaLR.java:170) with 1 output partitions (allowLocal=false)
14/06/06 11:38:56 INFO DAGScheduler: Final stage: Stage 0(collect at StreamingJavaLR.java:170)
14/06/06 11:38:56 INFO DAGScheduler: Parents of final stage: List()
14/06/06 11:38:56 INFO DAGScheduler: Missing parents: List()
14/06/06 11:38:56 INFO DAGScheduler: Submitting Stage 0 (MappedRDD[3] at map at MappedDStream.scala:35), which has no missing parents
14/06/06 11:38:56 INFO DAGScheduler: Submitting 1 missing tasks from Stage 0 (MappedRDD[3] at map at MappedDStream.scala:35)
14/06/06 11:38:56 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
14/06/06 11:38:57 INFO FileInputDStream: Finding new files took 1 ms
14/06/06 11:38:57 INFO FileInputDStream: New files at time 1402034937000 ms:
14/06/06 11:38:57 INFO JobScheduler: Added jobs for time 1402034937000 ms
14/06/06 11:38:57 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor#host-DSRV05.host.co.in:39424/user/Executor#-500165450] with ID 0
14/06/06 11:38:57 INFO TaskSetManager: Starting task 0.0:0 as TID 0 on executor 0: host-DSRV05.host.co.in (PROCESS_LOCAL)
14/06/06 11:38:57 INFO TaskSetManager: Serialized task 0.0:0 as 3544 bytes in 0 ms
14/06/06 11:38:57 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor#host-DSRV04.host.co.in:45532/user/Executor#1654371091] with ID 1
14/06/06 11:38:57 INFO BlockManagerInfo: Registering block manager host-DSRV05.host.co.in:53857 with 294.9 MB RAM
14/06/06 11:38:57 INFO BlockManagerInfo: Registering block manager host-DSRV04.host.co.in:38057 with 294.9 MB RAM
14/06/06 11:38:58 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:38:58 INFO FileInputDStream: New files at time 1402034938000 ms:
14/06/06 11:38:58 INFO JobScheduler: Added jobs for time 1402034938000 ms
14/06/06 11:38:59 INFO FileInputDStream: Finding new files took 1 ms
14/06/06 11:38:59 INFO FileInputDStream: New files at time 1402034939000 ms:
14/06/06 11:38:59 INFO JobScheduler: Added jobs for time 1402034939000 ms
14/06/06 11:39:00 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:39:00 INFO FileInputDStream: New files at time 1402034940000 ms:
14/06/06 11:39:00 INFO JobScheduler: Added jobs for time 1402034940000 ms
14/06/06 11:39:01 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:39:01 INFO FileInputDStream: New files at time 1402034941000 ms:
14/06/06 11:39:01 INFO JobScheduler: Added jobs for time 1402034941000 ms
14/06/06 11:39:02 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:39:02 INFO FileInputDStream: New files at time 1402034942000 ms:
14/06/06 11:39:02 INFO JobScheduler: Added jobs for time 1402034942000 ms
14/06/06 11:39:03 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:39:03 INFO FileInputDStream: New files at time 1402034943000 ms:
14/06/06 11:39:03 INFO JobScheduler: Added jobs for time 1402034943000 ms
14/06/06 11:39:04 INFO FileInputDStream: Finding new files took 0 ms
14/06/06 11:39:04 INFO FileInputDStream: New files at time 1402034944000 ms:
14/06/06 11:39:04 INFO JobScheduler: Added jobs for time 1402034944000 ms
14/06/06 11:39:05 INFO FileInputDStream: Finding new files took 1 ms
14/06/06 11:39:05 INFO FileInputDStream: New files at time 1402034945000 ms:
14/06/06 11:39:05 INFO JobScheduler: Added jobs for time 1402034945000 ms
14/06/06 11:39:06 INFO FileInputDStream: Finding new files took 1 ms
14/06/06 11:39:06 INFO FileInputDStream: New files at time 1402034946000 ms:
and this goes on forever. It never writes the output to the file it is supposed to.
The worker logs don't show anything different.
Any idea what the issue might be?
--
Thanks
Well, I was able to get it to work by running Spark over Mesos, but it looks like a bug when running Spark standalone.
Related
I am new to Spark Streaming and have developed a small Spark Streaming application.
I want to read files from a directory and print the output to the console
(or to a text file).
Below is the code I developed in Python:
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName='PysparkStreaming')
ssc = StreamingContext(sc, 3)

lines = ssc.textFileStream('file:///home/cloudera/spark/logs/')
counts = lines.flatMap(lambda line: line.split(" ")).map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
counts.pprint()
print(counts)

ssc.start()
ssc.awaitTermination()
When I run the code with
spark-submit as_log_stream.py
I keep getting the output below every 3 seconds (the batch interval declared for the stream), but the expected output, the word counts, is not displayed.
Could you please let me know what is wrong here? It would be very helpful.
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/zookeeper/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/flume-ng/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/parquet/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/avro/avro-tools-1.7.6-cdh5.13.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
18/12/10 02:01:36 INFO spark.SparkContext: Running Spark version 1.6.0
18/12/10 02:01:38 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
18/12/10 02:01:39 WARN util.Utils: Your hostname, quickstart.cloudera resolves to a loopback address: 127.0.0.1; using 192.168.186.133 instead (on interface eth1)
18/12/10 02:01:39 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
18/12/10 02:01:40 INFO spark.SecurityManager: Changing view acls to: cloudera
18/12/10 02:01:40 INFO spark.SecurityManager: Changing modify acls to: cloudera
18/12/10 02:01:40 INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(cloudera); users with modify permissions: Set(cloudera)
18/12/10 02:01:40 INFO util.Utils: Successfully started service 'sparkDriver' on port 50432.
18/12/10 02:01:41 INFO slf4j.Slf4jLogger: Slf4jLogger started
18/12/10 02:01:41 INFO Remoting: Starting remoting
18/12/10 02:01:42 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem#192.168.186.133:45299]
18/12/10 02:01:42 INFO Remoting: Remoting now listens on addresses: [akka.tcp://sparkDriverActorSystem#192.168.186.133:45299]
18/12/10 02:01:42 INFO util.Utils: Successfully started service 'sparkDriverActorSystem' on port 45299.
18/12/10 02:01:42 INFO spark.SparkEnv: Registering MapOutputTracker
18/12/10 02:01:42 INFO spark.SparkEnv: Registering BlockManagerMaster
18/12/10 02:01:42 INFO storage.DiskBlockManager: Created local directory at /tmp/blockmgr-78e8d300-dbad-4008-a4ec-339f3599d8a1
18/12/10 02:01:42 INFO storage.MemoryStore: MemoryStore started with capacity 534.5 MB
18/12/10 02:01:43 INFO spark.SparkEnv: Registering OutputCommitCoordinator
18/12/10 02:01:44 INFO server.Server: jetty-8.y.z-SNAPSHOT
18/12/10 02:01:44 WARN component.AbstractLifeCycle: FAILED SelectChannelConnector#0.0.0.0:4040: java.net.BindException: Address already in use
java.net.BindException: Address already in use
at sun.nio.ch.Net.bind0(Native Method)
at sun.nio.ch.Net.bind(Net.java:444)
at sun.nio.ch.Net.bind(Net.java:436)
at sun.nio.ch.ServerSocketChannelImpl.bind(ServerSocketChannelImpl.java:214)
at sun.nio.ch.ServerSocketAdaptor.bind(ServerSocketAdaptor.java:74)
at org.spark-project.jetty.server.nio.SelectChannelConnector.open(SelectChannelConnector.java:187)
at org.spark-project.jetty.server.AbstractConnector.doStart(AbstractConnector.java:316)
at org.spark-project.jetty.server.nio.SelectChannelConnector.doStart(SelectChannelConnector.java:265)
at org.spark-project.jetty.util.component.AbstractLifeCycle.start(AbstractLifeCycle.java:64)
at org.apache.spark.ui.JettyUtils$.org$apache$spark$ui$JettyUtils$$httpConnect$1(JettyUtils.scala:291)
at org.apache.spark.ui.JettyUtils$$anonfun$7.apply(JettyUtils.scala:295)
at org.apache.spark.ui.JettyUtils$$anonfun$7.apply(JettyUtils.scala:295)
at org.apache.spark.util.Utils$$anonfun$startServiceOnPort$1.apply$mcVI$sp(Utils.scala:2040)
at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:141)
at org.apache.spark.util.Utils$.startServiceOnPort(Utils.scala:2032)
at org.apache.spark.ui.JettyUtils$.startJettyServer(JettyUtils.scala:295)
at org.apache.spark.ui.WebUI.bind(WebUI.scala:127)
at org.apache.spark.SparkContext$$anonfun$14.apply(SparkContext.scala:489)
at org.apache.spark.SparkContext$$anonfun$14.apply(SparkContext.scala:489)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:489)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:59)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:214)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
18/12/10 02:01:44 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
18/12/10 02:01:44 INFO server.AbstractConnector: Started SelectChannelConnector#0.0.0.0:4041
18/12/10 02:01:44 INFO util.Utils: Successfully started service 'SparkUI' on port 4041.
18/12/10 02:01:44 INFO ui.SparkUI: Started SparkUI at http://192.168.186.133:4041
18/12/10 02:01:46 INFO util.Utils: Copying /home/cloudera/practice/spark/scripts/as_log_stream.py to /tmp/spark-a57a538e-e7c7-496b-ad88-8d968ac379e8/userFiles-b449f5bb-434a-47d7-a5bc-ca9eb3f9e001/as_log_stream.py
18/12/10 02:01:46 INFO spark.SparkContext: Added file file:/home/cloudera/practice/spark/scripts/as_log_stream.py at file:/home/cloudera/practice/spark/scripts/as_log_stream.py with timestamp 1544436106336
18/12/10 02:01:48 INFO executor.Executor: Starting executor ID driver on host localhost
18/12/10 02:01:48 INFO util.Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 35274.
18/12/10 02:01:48 INFO netty.NettyBlockTransferService: Server created on 35274
18/12/10 02:01:48 INFO storage.BlockManagerMaster: Trying to register BlockManager
18/12/10 02:01:48 INFO storage.BlockManagerMasterEndpoint: Registering block manager localhost:35274 with 534.5 MB RAM, BlockManagerId(driver, localhost, 35274)
18/12/10 02:01:48 INFO storage.BlockManagerMaster: Registered BlockManager
18/12/10 02:01:51 INFO dstream.FileInputDStream: Duration for remembering RDDs set to 60000 ms for org.apache.spark.streaming.dstream.FileInputDStream#326a8451
<pyspark.streaming.dstream.TransformedDStream object at 0x1111f90>
18/12/10 02:01:52 INFO dstream.ForEachDStream: metadataCleanupDelay = -1
18/12/10 02:01:52 INFO python.PythonTransformedDStream: metadataCleanupDelay = -1
18/12/10 02:01:52 INFO dstream.MappedDStream: metadataCleanupDelay = -1
18/12/10 02:01:52 INFO dstream.FileInputDStream: metadataCleanupDelay = -1
18/12/10 02:01:52 INFO dstream.FileInputDStream: Slide time = 3000 ms
18/12/10 02:01:52 INFO dstream.FileInputDStream: Storage level = StorageLevel(false, false, false, false, 1)
18/12/10 02:01:52 INFO dstream.FileInputDStream: Checkpoint interval = null
18/12/10 02:01:52 INFO dstream.FileInputDStream: Remember duration = 60000 ms
18/12/10 02:01:52 INFO dstream.FileInputDStream: Initialized and validated org.apache.spark.streaming.dstream.FileInputDStream#326a8451
18/12/10 02:01:52 INFO dstream.MappedDStream: Slide time = 3000 ms
18/12/10 02:01:52 INFO dstream.MappedDStream: Storage level = StorageLevel(false, false, false, false, 1)
18/12/10 02:01:52 INFO dstream.MappedDStream: Checkpoint interval = null
18/12/10 02:01:52 INFO dstream.MappedDStream: Remember duration = 3000 ms
18/12/10 02:01:52 INFO dstream.MappedDStream: Initialized and validated org.apache.spark.streaming.dstream.MappedDStream#1b8496f
18/12/10 02:01:52 INFO python.PythonTransformedDStream: Slide time = 3000 ms
18/12/10 02:01:52 INFO python.PythonTransformedDStream: Storage level = StorageLevel(false, false, false, false, 1)
18/12/10 02:01:52 INFO python.PythonTransformedDStream: Checkpoint interval = null
18/12/10 02:01:52 INFO python.PythonTransformedDStream: Remember duration = 3000 ms
18/12/10 02:01:52 INFO python.PythonTransformedDStream: Initialized and validated org.apache.spark.streaming.api.python.PythonTransformedDStream#69dd174a
18/12/10 02:01:52 INFO dstream.ForEachDStream: Slide time = 3000 ms
18/12/10 02:01:52 INFO dstream.ForEachDStream: Storage level = StorageLevel(false, false, false, false, 1)
18/12/10 02:01:52 INFO dstream.ForEachDStream: Checkpoint interval = null
18/12/10 02:01:52 INFO dstream.ForEachDStream: Remember duration = 3000 ms
18/12/10 02:01:52 INFO dstream.ForEachDStream: Initialized and validated org.apache.spark.streaming.dstream.ForEachDStream#32243192
18/12/10 02:01:52 INFO util.RecurringTimer: Started timer for JobGenerator at time 1544436114000
18/12/10 02:01:52 INFO scheduler.JobGenerator: Started JobGenerator at 1544436114000 ms
18/12/10 02:01:52 INFO scheduler.JobScheduler: Started JobScheduler
18/12/10 02:01:52 INFO streaming.StreamingContext: StreamingContext started
18/12/10 02:01:54 INFO dstream.FileInputDStream: Finding new files took 74 ms
18/12/10 02:01:54 INFO dstream.FileInputDStream: New files at time 1544436114000 ms:
18/12/10 02:01:55 INFO scheduler.JobScheduler: Added jobs for time 1544436114000 ms
18/12/10 02:01:55 INFO scheduler.JobScheduler: Starting job streaming job 1544436114000 ms.0 from job set of time 1544436114000 ms
18/12/10 02:01:55 INFO spark.SparkContext: Starting job: runJob at PythonRDD.scala:393
18/12/10 02:01:55 INFO scheduler.DAGScheduler: Registering RDD 3 (call at /usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py:1724)
18/12/10 02:01:55 INFO scheduler.DAGScheduler: Got job 0 (runJob at PythonRDD.scala:393) with 1 output partitions
18/12/10 02:01:55 INFO scheduler.DAGScheduler: Final stage: ResultStage 1 (runJob at PythonRDD.scala:393)
18/12/10 02:01:55 INFO scheduler.DAGScheduler: Parents of final stage: List(ShuffleMapStage 0)
18/12/10 02:01:55 INFO scheduler.DAGScheduler: Missing parents: List()
18/12/10 02:01:55 INFO scheduler.DAGScheduler: Submitting ResultStage 1 (PythonRDD[7] at RDD at PythonRDD.scala:43), which has no missing parents
18/12/10 02:01:57 INFO dstream.FileInputDStream: Finding new files took 51 ms
18/12/10 02:01:57 INFO dstream.FileInputDStream: New files at time 1544436117000 ms:
18/12/10 02:01:57 INFO storage.MemoryStore: Block broadcast_0 stored as values in memory (estimated size 5.9 KB, free 534.5 MB)
18/12/10 02:01:57 INFO scheduler.JobScheduler: Added jobs for time 1544436117000 ms
18/12/10 02:01:57 INFO storage.MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 3.5 KB, free 534.5 MB)
18/12/10 02:01:57 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:35274 (size: 3.5 KB, free: 534.5 MB)
18/12/10 02:01:57 INFO spark.SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1004
18/12/10 02:01:57 INFO scheduler.DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (PythonRDD[7] at RDD at PythonRDD.scala:43) (first 15 tasks are for partitions Vector(0))
18/12/10 02:01:57 INFO scheduler.TaskSchedulerImpl: Adding task set 1.0 with 1 tasks
18/12/10 02:01:57 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 1.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 1963 bytes)
18/12/10 02:01:57 INFO executor.Executor: Running task 0.0 in stage 1.0 (TID 0)
18/12/10 02:01:57 INFO executor.Executor: Fetching file:/home/cloudera/practice/spark/scripts/as_log_stream.py with timestamp 1544436106336
18/12/10 02:01:57 INFO util.Utils: /home/cloudera/practice/spark/scripts/as_log_stream.py has been previously copied to /tmp/spark-a57a538e-e7c7-496b-ad88-8d968ac379e8/userFiles-b449f5bb-434a-47d7-a5bc-ca9eb3f9e001/as_log_stream.py
18/12/10 02:01:57 INFO storage.ShuffleBlockFetcherIterator: Getting 0 non-empty blocks out of 0 blocks
18/12/10 02:01:57 INFO storage.ShuffleBlockFetcherIterator: Started 0 remote fetches in 34 ms
18/12/10 02:01:59 INFO python.PythonRunner: Times: total = 1902, boot = 1608, init = 119, finish = 175
18/12/10 02:02:00 INFO python.PythonRunner: Times: total = 5, boot = -70, init = 75, finish = 0
18/12/10 02:02:00 INFO dstream.FileInputDStream: Finding new files took 36 ms
18/12/10 02:02:00 INFO dstream.FileInputDStream: New files at time 1544436120000 ms:
18/12/10 02:02:00 INFO executor.Executor: Finished task 0.0 in stage 1.0 (TID 0). 1213 bytes result sent to driver
18/12/10 02:02:00 INFO scheduler.JobScheduler: Added jobs for time 1544436120000 ms
18/12/10 02:02:00 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 1.0 (TID 0) in 2725 ms on localhost (executor driver) (1/1)
18/12/10 02:02:00 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
18/12/10 02:02:00 INFO scheduler.DAGScheduler: ResultStage 1 (runJob at PythonRDD.scala:393) finished in 2.854 s
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Job 0 finished: runJob at PythonRDD.scala:393, took 5.008004 s
-------------------------------------------
Time: 2018-12-10 02:01:54
-------------------------------------------
18/12/10 02:02:00 INFO scheduler.JobScheduler: Finished job streaming job 1544436114000 ms.0 from job set of time 1544436114000 ms
18/12/10 02:02:00 INFO scheduler.JobScheduler: Total delay: 6.455 s for time 1544436114000 ms (execution: 5.270 s)
18/12/10 02:02:00 INFO scheduler.JobScheduler: Starting job streaming job 1544436117000 ms.0 from job set of time 1544436117000 ms
18/12/10 02:02:00 INFO dstream.FileInputDStream: Cleared 0 old files that were older than 1544436054000 ms:
18/12/10 02:02:00 INFO spark.SparkContext: Starting job: runJob at PythonRDD.scala:393
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Registering RDD 11 (call at /usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py:1724)
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Got job 1 (runJob at PythonRDD.scala:393) with 1 output partitions
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Final stage: ResultStage 3 (runJob at PythonRDD.scala:393)
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Parents of final stage: List(ShuffleMapStage 2)
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Missing parents: List()
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Submitting ResultStage 3 (PythonRDD[22] at RDD at PythonRDD.scala:43), which has no missing parents
18/12/10 02:02:00 INFO scheduler.ReceivedBlockTracker: Deleting batches ArrayBuffer()
18/12/10 02:02:00 INFO storage.MemoryStore: Block broadcast_1 stored as values in memory (estimated size 5.9 KB, free 534.5 MB)
18/12/10 02:02:00 INFO scheduler.InputInfoTracker: remove old batch metadata:
18/12/10 02:02:00 INFO storage.MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 3.5 KB, free 534.5 MB)
18/12/10 02:02:00 INFO storage.BlockManagerInfo: Added broadcast_1_piece0 in memory on localhost:35274 (size: 3.5 KB, free: 534.5 MB)
18/12/10 02:02:00 INFO spark.SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1004
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Submitting 1 missing tasks from ResultStage 3 (PythonRDD[22] at RDD at PythonRDD.scala:43) (first 15 tasks are for partitions Vector(0))
18/12/10 02:02:00 INFO scheduler.TaskSchedulerImpl: Adding task set 3.0 with 1 tasks
18/12/10 02:02:00 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 3.0 (TID 1, localhost, executor driver, partition 0, PROCESS_LOCAL, 1963 bytes)
18/12/10 02:02:00 INFO executor.Executor: Running task 0.0 in stage 3.0 (TID 1)
18/12/10 02:02:00 INFO storage.ShuffleBlockFetcherIterator: Getting 0 non-empty blocks out of 0 blocks
18/12/10 02:02:00 INFO storage.ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/12/10 02:02:00 INFO python.PythonRunner: Times: total = 42, boot = -574, init = 616, finish = 0
18/12/10 02:02:00 INFO python.PythonRunner: Times: total = 44, boot = 6, init = 38, finish = 0
18/12/10 02:02:00 INFO executor.Executor: Finished task 0.0 in stage 3.0 (TID 1). 1213 bytes result sent to driver
18/12/10 02:02:00 INFO scheduler.DAGScheduler: ResultStage 3 (runJob at PythonRDD.scala:393) finished in 0.115 s
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Job 1 finished: runJob at PythonRDD.scala:393, took 0.190777 s
18/12/10 02:02:00 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 3.0 (TID 1) in 128 ms on localhost (executor driver) (1/1)
18/12/10 02:02:00 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool
-------------------------------------------
Time: 2018-12-10 02:01:57
-------------------------------------------
18/12/10 02:02:00 INFO scheduler.JobScheduler: Finished job streaming job 1544436117000 ms.0 from job set of time 1544436117000 ms
18/12/10 02:02:00 INFO scheduler.JobScheduler: Total delay: 3.815 s for time 1544436117000 ms (execution: 0.335 s)
18/12/10 02:02:00 INFO scheduler.JobScheduler: Starting job streaming job 1544436120000 ms.0 from job set of time 1544436120000 ms
18/12/10 02:02:00 INFO python.PythonRDD: Removing RDD 6 from persistence list
18/12/10 02:02:00 INFO spark.SparkContext: Starting job: runJob at PythonRDD.scala:393
18/12/10 02:02:00 INFO rdd.MapPartitionsRDD: Removing RDD 1 from persistence list
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Registering RDD 18 (call at /usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py:1724)
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Got job 2 (runJob at PythonRDD.scala:393) with 1 output partitions
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Final stage: ResultStage 5 (runJob at PythonRDD.scala:393)
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Parents of final stage: List(ShuffleMapStage 4)
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Missing parents: List()
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Submitting ResultStage 5 (PythonRDD[23] at RDD at PythonRDD.scala:43), which has no missing parents
18/12/10 02:02:00 INFO storage.BlockManager: Removing RDD 6
18/12/10 02:02:00 INFO dstream.FileInputDStream: Cleared 0 old files that were older than 1544436057000 ms:
18/12/10 02:02:00 INFO scheduler.ReceivedBlockTracker: Deleting batches ArrayBuffer()
18/12/10 02:02:00 INFO scheduler.InputInfoTracker: remove old batch metadata:
18/12/10 02:02:00 INFO storage.MemoryStore: Block broadcast_2 stored as values in memory (estimated size 5.9 KB, free 534.5 MB)
18/12/10 02:02:00 INFO storage.BlockManager: Removing RDD 1
18/12/10 02:02:00 INFO storage.MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 3.5 KB, free 534.5 MB)
18/12/10 02:02:00 INFO storage.BlockManagerInfo: Added broadcast_2_piece0 in memory on localhost:35274 (size: 3.5 KB, free: 534.5 MB)
18/12/10 02:02:00 INFO spark.SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1004
18/12/10 02:02:00 INFO scheduler.DAGScheduler: Submitting 1 missing tasks from ResultStage 5 (PythonRDD[23] at RDD at PythonRDD.scala:43) (first 15 tasks are for partitions Vector(0))
18/12/10 02:02:00 INFO scheduler.TaskSchedulerImpl: Adding task set 5.0 with 1 tasks
18/12/10 02:02:00 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 5.0 (TID 2, localhost, executor driver, partition 0, PROCESS_LOCAL, 1963 bytes)
18/12/10 02:02:00 INFO executor.Executor: Running task 0.0 in stage 5.0 (TID 2)
18/12/10 02:02:00 INFO storage.ShuffleBlockFetcherIterator: Getting 0 non-empty blocks out of 0 blocks
18/12/10 02:02:00 INFO storage.ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/12/10 02:02:00 INFO python.PythonRunner: Times: total = 41, boot = -103, init = 144, finish = 0
18/12/10 02:02:01 INFO python.PythonRunner: Times: total = 56, boot = 22, init = 34, finish = 0
18/12/10 02:02:01 INFO executor.Executor: Finished task 0.0 in stage 5.0 (TID 2). 1213 bytes result sent to driver
18/12/10 02:02:01 INFO scheduler.DAGScheduler: ResultStage 5 (runJob at PythonRDD.scala:393) finished in 0.128 s
18/12/10 02:02:01 INFO scheduler.DAGScheduler: Job 2 finished: runJob at PythonRDD.scala:393, took 0.174702 s
18/12/10 02:02:01 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 5.0 (TID 2) in 129 ms on localhost (executor driver) (1/1)
18/12/10 02:02:01 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 5.0, whose tasks have all completed, from pool
-------------------------------------------
Time: 2018-12-10 02:02:00
-------------------------------------------
18/12/10 02:02:01 INFO scheduler.JobScheduler: Finished job streaming job 1544436120000 ms.0 from job set of time 1544436120000 ms
18/12/10 02:02:01 INFO scheduler.JobScheduler: Total delay: 1.045 s for time 1544436120000 ms (execution: 0.230 s)
18/12/10 02:02:01 INFO python.PythonRDD: Removing RDD 14 from persistence list
18/12/10 02:02:01 INFO storage.BlockManager: Removing RDD 14
18/12/10 02:02:01 INFO rdd.MapPartitionsRDD: Removing RDD 9 from persistence list
18/12/10 02:02:01 INFO storage.BlockManager: Removing RDD 9
18/12/10 02:02:01 INFO dstream.FileInputDStream: Cleared 0 old files that were older than 1544436060000 ms:
18/12/10 02:02:01 INFO scheduler.ReceivedBlockTracker: Deleting batches ArrayBuffer()
18/12/10 02:02:01 INFO scheduler.InputInfoTracker: remove old batch metadata:
Script used to generate the files (this part is working as expected):
from random import randint
import time

def main():
    createFile()

def createFile():
    print('creating files')
    # read the source log once
    with open('//home//cloudera//practice//spark//source//server_log_name_12_0008.log', 'r') as logfile:
        loglines = logfile.readlines()
    linecount = 0
    while linecount <= 70:
        totalline = len(loglines)
        linenumber = randint(0, totalline - 10)
        # write a random tail slice of the source log as a new file every 2 seconds
        with open('//home//cloudera//spark//logs//log{0}.txt'.format(linecount), 'w') as writefile:
            writefile.write(' '.join(line for line in loglines[linenumber:totalline]))
        print('creating file log{0}.txt'.format(linecount))
        linecount += 1
        time.sleep(2)

if __name__ == '__main__':
    main()
Observation: I can see the files being added in the Spark Streaming log, and the result is sent to the driver. Where is that output sent? How can I collect the output in a file? Please suggest.
18/12/12 17:22:10 INFO dstream.FileInputDStream: Finding new files took 4611 ms
18/12/12 17:22:10 WARN dstream.FileInputDStream: Time taken to find new files exceeds the batch size. Consider increasing the batch size or reducing the number of files in the monitored directory.
18/12/12 17:22:10 INFO dstream.FileInputDStream: New files at time 1544664126000 ms:
18/12/12 17:22:11 INFO scheduler.JobScheduler: Added jobs for time 1544664126000 ms
18/12/12 17:22:11 INFO dstream.FileInputDStream: Finding new files took 53 ms
**18/12/12 17:22:11 INFO dstream.FileInputDStream: New files at time 1544664129000 ms:
file:/home/cloudera/practice/spark/logs/log0.txt
file:/home/cloudera/practice/spark/logs/log1.txt**
18/12/12 17:22:11 INFO python.PythonRunner: Times: total = 9183, boot = 8301, init = 262, finish = 620
18/12/12 17:22:11 INFO python.PythonRunner: Times: total = 39, boot = 38, init = 1, finish = 0
**18/12/12 17:22:11 INFO executor.Executor: Finished task 0.0 in stage 1.0 (TID 0). 1213 bytes result sent to driver**
18/12/12 17:22:12 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 1.0 (TID 0) in 9720 ms on localhost (executor driver) (1/1)
18/12/12 17:22:12 INFO scheduler.DAGScheduler: ResultStage 1 (runJob at PythonRDD.scala:393) finished in 9.797 s
18/12/12 17:22:12 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
18/12/12 17:22:12 INFO scheduler.DAGScheduler: Job 0 finished: runJob at PythonRDD.scala:393, took 11.534252 s
My first take is that you're not adding any new files to file:///home/cloudera/spark/logs/ while your Spark Streaming program is running.
textFileStream only picks up new data after your job starts. Once your job is running, try copying some files into the directory.
Also, are you sure you're not working with HDFS? I see Cloudera and Spark, so that usually means Hadoop. If so, you need to make sure you're using hdfs://home/cloudera/spark/logs or, if you don't have your Hadoop NameNode configured, hdfs://host:port/home/cloudera/spark/logs/.
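To get the word counts into files rather than only the console, a minimal sketch is to add DStream.saveAsTextFiles, which writes one directory of part files per batch; the output path prefix below is just a placeholder:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName='PysparkStreaming')
ssc = StreamingContext(sc, 3)

lines = ssc.textFileStream('file:///home/cloudera/spark/logs/')
counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(lambda a, b: a + b)

# One output directory per batch, e.g. .../wordcounts-1544436114000/part-00000
counts.saveAsTextFiles('file:///home/cloudera/spark/output/wordcounts')
counts.pprint()

ssc.start()
ssc.awaitTermination()
Either way, new files still have to land in the monitored directory after ssc.start() for textFileStream to pick them up.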
Folks!
I am attempting to move Spark processing to a Standalone cluster. Previously, jobs were running successfully on a cluster set up with 1 worker node + master on the same machine. For example, one of the jobs ran without any issues with 4 GB and 2 cores, and the same in local[*] mode.
Now I have set up the cluster in Kubernetes with 24.0 GB RAM and 6 cores (3 workers + master), and I am getting errors. For that simple job I am using all of the resources available in the cluster.
Spark 2.2.0, client mode.
spark-submit \
  --name RSS_Analysys \
  --class org.MainClass \
  --conf "spark.driver.extraJavaOptions=-Ddata=file:///aws/efs/data/" \
  --conf spark.executor.memory=8g \
  --conf spark.executor.cores=2 \
  --master spark://AWS-ELB:7077 \
  --deploy-mode client \
  --packages com.squareup.okhttp:okhttp:2.7.5 \
  file:///aws/efs/app/rss_app.jar
Driver output:
17/07/20 16:42:13 INFO TaskSetManager: Finished task 199.0 in stage 95.0 (TID 4017) in 4 ms on 172.12.0.1 (executor 2) (200/200)
17/07/20 16:42:13 INFO TaskSchedulerImpl: Removed TaskSet 95.0, whose tasks have all completed, from pool
17/07/20 16:42:13 INFO DAGScheduler: ShuffleMapStage 95 (head at RSSProcessor.scala:76) finished in 0.670 s
17/07/20 16:42:13 INFO DAGScheduler: looking for newly runnable stages
17/07/20 16:42:13 INFO DAGScheduler: running: Set(ShuffleMapStage 96)
17/07/20 16:42:13 INFO DAGScheduler: waiting: Set(ResultStage 97)
17/07/20 16:42:13 INFO DAGScheduler: failed: Set()
17/07/20 16:42:25 WARN StandaloneAppClient$ClientEndpoint: Connection to 172.12.0.1:7077 failed; waiting for master to reconnect...
17/07/20 16:42:25 WARN StandaloneSchedulerBackend: Disconnected from Spark cluster! Waiting for reconnection...
17/07/20 16:42:26 ERROR TaskSchedulerImpl: Lost executor 0 on 172.12.0.2: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
17/07/20 16:42:26 ERROR TaskSchedulerImpl: Lost executor 1 on 172.12.0.3: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
17/07/20 16:42:26 INFO DAGScheduler: Executor lost: 0 (epoch 29)
17/07/20 16:42:26 INFO BlockManagerMasterEndpoint: Trying to remove executor 0 from BlockManagerMaster.
17/07/20 16:42:26 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_9_0 !
17/07/20 16:42:26 INFO BlockManagerMasterEndpoint: Removing block manager BlockManagerId(0, 172.12.0.2, 42948, None)
17/07/20 16:42:26 INFO BlockManagerMaster: Removed 0 successfully in removeExecutor
17/07/20 16:42:26 INFO DAGScheduler: Shuffle files lost for executor: 0 (epoch 29)
17/07/20 16:42:26 INFO ShuffleMapStage: ShuffleMapStage 93 is now unavailable on executor 0 (124/200, false)
17/07/20 16:42:26 INFO ShuffleMapStage: ShuffleMapStage 95 is now unavailable on executor 0 (126/200, false)
17/07/20 16:42:26 INFO ShuffleMapStage: ShuffleMapStage 94 is now unavailable on executor 0 (0/1, false)
17/07/20 16:42:26 INFO ShuffleMapStage: ShuffleMapStage 92 is now unavailable on executor 0 (0/1, false)
17/07/20 16:42:26 INFO DAGScheduler: Executor lost: 1 (epoch 35)
17/07/20 16:42:26 INFO BlockManagerMasterEndpoint: Trying to remove executor 1 from BlockManagerMaster.
17/07/20 16:42:26 INFO BlockManagerMasterEndpoint: Removing block manager BlockManagerId(1, 172.12.0.3, 37556, None)
17/07/20 16:42:26 INFO BlockManagerMaster: Removed 1 successfully in removeExecutor
17/07/20 16:42:26 INFO DAGScheduler: Shuffle files lost for executor: 1 (epoch 35)
17/07/20 16:42:26 INFO ShuffleMapStage: ShuffleMapStage 91 is now unavailable on executor 1 (0/1, false)
17/07/20 16:42:26 INFO ShuffleMapStage: ShuffleMapStage 93 is now unavailable on executor 1 (65/200, false)
17/07/20 16:42:26 INFO ShuffleMapStage: ShuffleMapStage 95 is now unavailable on executor 1 (41/200, false)
17/07/20 16:42:26 ERROR TaskSchedulerImpl: Lost executor 2 on 172.12.0.1: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
17/07/20 16:42:26 WARN TaskSetManager: Lost task 0.0 in stage 96.0 (TID 3617, 172.12.0.1, executor 2): ExecutorLostFailure (executor 2 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
17/07/20 16:42:26 INFO DAGScheduler: Executor lost: 2 (epoch 41)
17/07/20 16:42:26 INFO BlockManagerMasterEndpoint: Trying to remove executor 2 from BlockManagerMaster.
17/07/20 16:42:26 INFO BlockManagerMasterEndpoint: Removing block manager BlockManagerId(2, 172.12.0.1, 33264, None)
17/07/20 16:42:26 INFO BlockManagerMaster: Removed 2 successfully in removeExecutor
17/07/20 16:42:26 INFO DAGScheduler: Shuffle files lost for executor: 2 (epoch 41)
17/07/20 16:42:26 INFO ShuffleMapStage: ShuffleMapStage 93 is now unavailable on executor 2 (0/200, false)
17/07/20 16:42:26 INFO ShuffleMapStage: ShuffleMapStage 95 is now unavailable on executor 2 (0/200, false)
17/07/20 17:11:26 INFO BlockManagerInfo: Removed broadcast_50_piece0 on 172.12.0.4:36275 in memory (size: 12.1 KB, free: 366.2 MB)
17/07/20 17:11:26 INFO BlockManagerInfo: Removed broadcast_45_piece0 on 172.12.0.4:36275 in memory (size: 5.5 KB, free: 366.2 MB)
17/07/20 17:11:26 INFO BlockManagerInfo: Removed broadcast_49_piece0 on 172.12.0.4:36275 in memory (size: 12.1 KB, free: 366.2 MB)
Interestingly, about 12 seconds pass before the error shows up (maybe some timeout settings are required?):
17/07/20 16:42:13 INFO DAGScheduler: failed: Set()
17/07/20 16:42:25 WARN StandaloneAppClient$ClientEndpoint: Connection to 172.12.0.1:7077 failed; waiting for master to reconnect...
Before that, I can see that some stages complete successfully.
Any help or advice is highly appreciated.
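For reference, the kind of timeout settings I have in mind would look something like this when building the SparkConf (just a guess on my part; the property names are real Spark settings, but the values are illustrative):
// Hypothetical: raise the network/RPC timeouts in case the disconnects are timeout-related
val conf = new org.apache.spark.SparkConf()
  .set("spark.network.timeout", "300s")            // default is 120s
  .set("spark.executor.heartbeatInterval", "30s")  // must stay well below spark.network.timeout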
In Scala, what happens when I use a global map variable without broadcasting it?
E.g., if I get a variable using collect* (such as collectAsMap), it behaves like a global variable, and I can use it in all RDD.mapValues() functions without explicitly broadcasting it.
BUT I know Spark works in a distributed fashion, so it should not be able to process a variable stored in driver memory without broadcasting it. So, what happened?
Code example (this code computes TF-IDF over text, where the DF counts are stored in a Map):
// dfMap is a String -> Int Map in memory
// Array[(String, Int)] = Array((B,2), (A,3), (C,1))
val dfMap = dfrdd.collectAsMap
// tfrdd is an RDD, and I can use dfMap in its mapValues function
// tfrdd: Array((doc1,Map(A -> 3.0)), (doc2,Map(A -> 2.0, B -> 1.0)))
val tfidfrdd = tfrdd.mapValues( e => e.map(x => x._1 -> x._2 * lineNum / dfMap.getOrElse(x._1, 1) ) );
tfidfrdd.saveAsTextFile("/somedir/result/");
The code works just fine. My question is what happened there? Does the driver send the dfMap to all workers just like broadcasting or else?
What's the difference if I code the broadcast explicitly, like this:
val dfMap = sc.broadcast(dfrdd.collectAsMap)
val tfidfrdd = tfrdd.mapValues( e => e.map(x => x._1 -> x._2 * lineNum / dfMap.value.getOrElse(x._1, 1) ) )
I've checked more resources, aggregated others' answers, and put it in order. The difference between using an external variable DIRECTLY (my so-called "global variable") and BROADCASTING a variable with sc.broadcast() is as follows:
1) When an external variable is used directly, Spark sends a serialized copy of it together with each TASK, whereas with sc.broadcast the variable is sent once per EXECUTOR. The number of tasks is normally around 10 times the number of executors.
So when the variable (say a map) is large enough (more than 20K), the former approach can spend a lot of time on network transfer and cause frequent GC, which slows Spark down. Hence it is suggested to broadcast large variables (>20K) explicitly.
2) When an external variable is used directly, it is not persisted: it ends with the task and cannot be reused. With sc.broadcast() the variable is automatically persisted in the executors' memory and lasts until you explicitly unpersist it, so a broadcast variable is available across tasks and stages.
So if the variable is expected to be used multiple times, sc.broadcast() is the suggested approach.
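Here is a minimal, self-contained sketch illustrating the two approaches side by side (the data and names are illustrative, not taken from the question):
import org.apache.spark.{SparkConf, SparkContext}

object BroadcastSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("BroadcastSketch").setMaster("local[*]"))
    val lookup = Map("A" -> 3, "B" -> 2, "C" -> 1)   // driver-side map

    val rdd = sc.parallelize(Seq("A", "B", "C", "D"))

    // 1) Direct capture: `lookup` is serialized into the closure and shipped with every task
    val direct = rdd.map(k => lookup.getOrElse(k, 0))

    // 2) Explicit broadcast: one copy is shipped per executor and cached there
    val bc = sc.broadcast(lookup)
    val viaBroadcast = rdd.map(k => bc.value.getOrElse(k, 0))

    println(direct.collect().mkString(","))          // 3,2,1,0
    println(viaBroadcast.collect().mkString(","))    // 3,2,1,0

    bc.unpersist()   // release the executor-side copies once no longer needed
    sc.stop()
  }
}
Both versions compute the same result; the difference is only in how many copies of lookup travel over the network and how long they stay cached on the executors.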
There is no difference between a Global Map Variable and a Broadcast variable. If we use a global variable in a map function of an RDD then it will be broadcasted to all nodes. For example:
scala> val list = List(1,2,3)
list: List[Int] = List(1, 2, 3)
scala> val rdd = sc.parallelize(List(1,2,3,4))
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[4] at parallelize at <console>:24
scala> rdd.filter(elem => list.contains(elem)).collect
17/03/16 10:21:53 INFO SparkContext: Starting job: collect at <console>:29
17/03/16 10:21:53 INFO DAGScheduler: Got job 3 (collect at <console>:29) with 4 output partitions
17/03/16 10:21:53 INFO DAGScheduler: Final stage: ResultStage 3 (collect at <console>:29)
17/03/16 10:21:53 INFO DAGScheduler: Parents of final stage: List()
17/03/16 10:21:53 INFO DAGScheduler: Missing parents: List()
17/03/16 10:21:53 DEBUG DAGScheduler: submitStage(ResultStage 3)
17/03/16 10:21:53 DEBUG DAGScheduler: missing: List()
17/03/16 10:21:53 INFO DAGScheduler: Submitting ResultStage 3 (MapPartitionsRDD[5] at filter at <console>:29), which has no missing parents
17/03/16 10:21:53 DEBUG DAGScheduler: submitMissingTasks(ResultStage 3)
17/03/16 10:21:53 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 5.0 KB, free 366.3 MB)
17/03/16 10:21:53 DEBUG BlockManager: Put block broadcast_4 locally took 1 ms
17/03/16 10:21:53 DEBUG BlockManager: Putting block broadcast_4 without replication took 1 ms
17/03/16 10:21:53 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 2.5 KB, free 366.3 MB)
17/03/16 10:21:53 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on 192.168.2.123:37645 (size: 2.5 KB, free: 366.3 MB)
17/03/16 10:21:53 DEBUG BlockManagerMaster: Updated info of block broadcast_4_piece0
17/03/16 10:21:53 DEBUG BlockManager: Told master about block broadcast_4_piece0
17/03/16 10:21:53 DEBUG BlockManager: Put block broadcast_4_piece0 locally took 2 ms
17/03/16 10:21:53 DEBUG ContextCleaner: Got cleaning task CleanBroadcast(1)
17/03/16 10:21:53 DEBUG BlockManager: Putting block broadcast_4_piece0 without replication took 2 ms
17/03/16 10:21:53 DEBUG ContextCleaner: Cleaning broadcast 1
17/03/16 10:21:53 DEBUG TorrentBroadcast: Unpersisting TorrentBroadcast 1
17/03/16 10:21:53 INFO SparkContext: Created broadcast 4 from broadcast at DAGScheduler.scala:996
17/03/16 10:21:53 INFO DAGScheduler: Submitting 4 missing tasks from ResultStage 3 (MapPartitionsRDD[5] at filter at <console>:29)
17/03/16 10:21:53 DEBUG DAGScheduler: New pending partitions: Set(0, 1, 2, 3)
17/03/16 10:21:53 INFO TaskSchedulerImpl: Adding task set 3.0 with 4 tasks
17/03/16 10:21:53 DEBUG TaskSetManager: Epoch for TaskSet 3.0: 0
17/03/16 10:21:53 DEBUG TaskSetManager: Valid locality levels for TaskSet 3.0: NO_PREF, ANY
17/03/16 10:21:53 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_3.0, runningTasks: 0
17/03/16 10:21:53 INFO TaskSetManager: Starting task 0.0 in stage 3.0 (TID 12, localhost, executor driver, partition 0, PROCESS_LOCAL, 5886 bytes)
17/03/16 10:21:53 INFO TaskSetManager: Starting task 1.0 in stage 3.0 (TID 13, localhost, executor driver, partition 1, PROCESS_LOCAL, 5886 bytes)
17/03/16 10:21:53 INFO TaskSetManager: Starting task 2.0 in stage 3.0 (TID 14, localhost, executor driver, partition 2, PROCESS_LOCAL, 5886 bytes)
17/03/16 10:21:53 INFO TaskSetManager: Starting task 3.0 in stage 3.0 (TID 15, localhost, executor driver, partition 3, PROCESS_LOCAL, 5886 bytes)
17/03/16 10:21:53 INFO Executor: Running task 0.0 in stage 3.0 (TID 12)
17/03/16 10:21:53 DEBUG Executor: Task 12's epoch is 0
17/03/16 10:21:53 DEBUG BlockManager: Getting local block broadcast_4
17/03/16 10:21:53 DEBUG BlockManager: Level for block broadcast_4 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:21:53 INFO Executor: Running task 2.0 in stage 3.0 (TID 14)
17/03/16 10:21:53 INFO Executor: Running task 1.0 in stage 3.0 (TID 13)
17/03/16 10:21:53 DEBUG BlockManagerSlaveEndpoint: removing broadcast 1
17/03/16 10:21:53 DEBUG BlockManager: Removing broadcast 1
17/03/16 10:21:53 DEBUG BlockManager: Removing block broadcast_1
17/03/16 10:21:53 INFO Executor: Running task 3.0 in stage 3.0 (TID 15)
17/03/16 10:21:53 DEBUG Executor: Task 13's epoch is 0
17/03/16 10:21:53 DEBUG MemoryStore: Block broadcast_1 of size 5112 dropped from memory (free 384072627)
17/03/16 10:21:53 DEBUG BlockManager: Removing block broadcast_1_piece0
17/03/16 10:21:53 DEBUG MemoryStore: Block broadcast_1_piece0 of size 2535 dropped from memory (free 384075162)
17/03/16 10:21:53 INFO BlockManagerInfo: Removed broadcast_1_piece0 on 192.168.2.123:37645 in memory (size: 2.5 KB, free: 366.3 MB)
17/03/16 10:21:53 DEBUG BlockManagerMaster: Updated info of block broadcast_1_piece0
17/03/16 10:21:53 DEBUG BlockManager: Told master about block broadcast_1_piece0
17/03/16 10:21:53 DEBUG BlockManager: Getting local block broadcast_4
17/03/16 10:21:53 DEBUG BlockManager: Level for block broadcast_4 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:21:53 DEBUG Executor: Task 14's epoch is 0
17/03/16 10:21:53 DEBUG BlockManager: Getting local block broadcast_4
17/03/16 10:21:53 DEBUG BlockManager: Level for block broadcast_4 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:21:53 DEBUG Executor: Task 15's epoch is 0
17/03/16 10:21:53 DEBUG BlockManager: Getting local block broadcast_4
17/03/16 10:21:53 DEBUG BlockManager: Level for block broadcast_4 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:21:53 DEBUG BlockManagerSlaveEndpoint: Done removing broadcast 1, response is 0
17/03/16 10:21:53 DEBUG ContextCleaner: Cleaned broadcast 1
17/03/16 10:21:53 DEBUG ContextCleaner: Got cleaning task CleanBroadcast(3)
17/03/16 10:21:53 DEBUG ContextCleaner: Cleaning broadcast 3
17/03/16 10:21:53 DEBUG TorrentBroadcast: Unpersisting TorrentBroadcast 3
17/03/16 10:21:53 DEBUG BlockManagerSlaveEndpoint: removing broadcast 3
17/03/16 10:21:53 DEBUG BlockManager: Removing broadcast 3
17/03/16 10:21:53 DEBUG BlockManager: Removing block broadcast_3_piece0
17/03/16 10:21:53 DEBUG MemoryStore: Block broadcast_3_piece0 of size 3309 dropped from memory (free 384078471)
17/03/16 10:21:53 DEBUG BlockManagerSlaveEndpoint: Sent response: 0 to 192.168.2.123:40909
17/03/16 10:21:53 INFO BlockManagerInfo: Removed broadcast_3_piece0 on 192.168.2.123:37645 in memory (size: 3.2 KB, free: 366.3 MB)
17/03/16 10:21:53 DEBUG BlockManagerMaster: Updated info of block broadcast_3_piece0
17/03/16 10:21:53 DEBUG BlockManager: Told master about block broadcast_3_piece0
17/03/16 10:21:53 DEBUG BlockManager: Removing block broadcast_3
17/03/16 10:21:53 DEBUG MemoryStore: Block broadcast_3 of size 6904 dropped from memory (free 384085375)
17/03/16 10:21:53 INFO Executor: Finished task 1.0 in stage 3.0 (TID 13). 912 bytes result sent to driver
17/03/16 10:21:53 DEBUG BlockManagerSlaveEndpoint: Done removing broadcast 3, response is 0
17/03/16 10:21:53 DEBUG BlockManagerSlaveEndpoint: Sent response: 0 to 192.168.2.123:40909
17/03/16 10:21:53 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_3.0, runningTasks: 3
17/03/16 10:21:53 DEBUG TaskSetManager: No tasks for locality level NO_PREF, so moving to locality level ANY
17/03/16 10:21:53 INFO TaskSetManager: Finished task 1.0 in stage 3.0 (TID 13) in 36 ms on localhost (executor driver) (1/4)
17/03/16 10:21:53 INFO Executor: Finished task 2.0 in stage 3.0 (TID 14). 912 bytes result sent to driver
17/03/16 10:21:53 DEBUG ContextCleaner: Cleaned broadcast 3
17/03/16 10:21:53 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_3.0, runningTasks: 2
17/03/16 10:21:53 INFO Executor: Finished task 0.0 in stage 3.0 (TID 12). 912 bytes result sent to driver
17/03/16 10:21:53 INFO TaskSetManager: Finished task 2.0 in stage 3.0 (TID 14) in 36 ms on localhost (executor driver) (2/4)
17/03/16 10:21:53 INFO Executor: Finished task 3.0 in stage 3.0 (TID 15). 908 bytes result sent to driver
17/03/16 10:21:53 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_3.0, runningTasks: 1
17/03/16 10:21:53 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_3.0, runningTasks: 0
17/03/16 10:21:53 INFO TaskSetManager: Finished task 3.0 in stage 3.0 (TID 15) in 36 ms on localhost (executor driver) (3/4)
17/03/16 10:21:53 INFO TaskSetManager: Finished task 0.0 in stage 3.0 (TID 12) in 45 ms on localhost (executor driver) (4/4)
17/03/16 10:21:53 INFO TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool
17/03/16 10:21:53 INFO DAGScheduler: ResultStage 3 (collect at <console>:29) finished in 0.045 s
17/03/16 10:21:53 DEBUG DAGScheduler: After removal of stage 3, remaining stages = 0
17/03/16 10:21:53 INFO DAGScheduler: Job 3 finished: collect at <console>:29, took 0.097564 s
res4: Array[Int] = Array(1, 2, 3)
In the above log we can clearly see that the global variable list is broadcast. The same is the case when we explicitly broadcast the list:
scala> val br = sc.broadcast(list)
17/03/16 10:26:40 INFO MemoryStore: Block broadcast_5 stored as values in memory (estimated size 160.0 B, free 366.3 MB)
17/03/16 10:26:40 DEBUG BlockManager: Put block broadcast_5 locally took 1 ms
17/03/16 10:26:40 DEBUG BlockManager: Putting block broadcast_5 without replication took 1 ms
17/03/16 10:26:40 INFO MemoryStore: Block broadcast_5_piece0 stored as bytes in memory (estimated size 227.0 B, free 366.3 MB)
17/03/16 10:26:40 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on 192.168.2.123:37645 (size: 227.0 B, free: 366.3 MB)
17/03/16 10:26:40 DEBUG BlockManagerMaster: Updated info of block broadcast_5_piece0
17/03/16 10:26:40 DEBUG BlockManager: Told master about block broadcast_5_piece0
17/03/16 10:26:40 DEBUG BlockManager: Put block broadcast_5_piece0 locally took 1 ms
17/03/16 10:26:40 DEBUG BlockManager: Putting block broadcast_5_piece0 without replication took 1 ms
17/03/16 10:26:40 INFO SparkContext: Created broadcast 5 from broadcast at <console>:26
br: org.apache.spark.broadcast.Broadcast[List[Int]] = Broadcast(5)
scala> rdd.filter(elem => br.value.contains(elem)).collect
17/03/16 10:27:50 INFO SparkContext: Starting job: collect at <console>:31
17/03/16 10:27:50 INFO DAGScheduler: Got job 0 (collect at <console>:31) with 4 output partitions
17/03/16 10:27:50 INFO DAGScheduler: Final stage: ResultStage 0 (collect at <console>:31)
17/03/16 10:27:50 INFO DAGScheduler: Parents of final stage: List()
17/03/16 10:27:50 INFO DAGScheduler: Missing parents: List()
17/03/16 10:27:50 DEBUG DAGScheduler: submitStage(ResultStage 0)
17/03/16 10:27:50 DEBUG DAGScheduler: missing: List()
17/03/16 10:27:50 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[1] at filter at <console>:31), which has no missing parents
17/03/16 10:27:50 DEBUG DAGScheduler: submitMissingTasks(ResultStage 0)
17/03/16 10:27:50 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 6.7 KB, free 366.3 MB)
17/03/16 10:27:50 DEBUG BlockManager: Put block broadcast_1 locally took 6 ms
17/03/16 10:27:50 DEBUG BlockManager: Putting block broadcast_1 without replication took 6 ms
17/03/16 10:27:50 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 3.2 KB, free 366.3 MB)
17/03/16 10:27:50 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 192.168.2.123:37303 (size: 3.2 KB, free: 366.3 MB)
17/03/16 10:27:50 DEBUG BlockManagerMaster: Updated info of block broadcast_1_piece0
17/03/16 10:27:50 DEBUG BlockManager: Told master about block broadcast_1_piece0
17/03/16 10:27:50 DEBUG BlockManager: Put block broadcast_1_piece0 locally took 2 ms
17/03/16 10:27:50 DEBUG BlockManager: Putting block broadcast_1_piece0 without replication took 2 ms
17/03/16 10:27:50 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:996
17/03/16 10:27:50 INFO DAGScheduler: Submitting 4 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at filter at <console>:31)
17/03/16 10:27:50 DEBUG DAGScheduler: New pending partitions: Set(0, 1, 2, 3)
17/03/16 10:27:50 INFO TaskSchedulerImpl: Adding task set 0.0 with 4 tasks
17/03/16 10:27:50 DEBUG TaskSetManager: Epoch for TaskSet 0.0: 0
17/03/16 10:27:50 DEBUG TaskSetManager: Valid locality levels for TaskSet 0.0: NO_PREF, ANY
17/03/16 10:27:50 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_0.0, runningTasks: 0
17/03/16 10:27:50 DEBUG TaskSetManager: Valid locality levels for TaskSet 0.0: NO_PREF, ANY
17/03/16 10:27:51 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 5885 bytes)
17/03/16 10:27:51 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, localhost, executor driver, partition 1, PROCESS_LOCAL, 5885 bytes)
17/03/16 10:27:51 INFO TaskSetManager: Starting task 2.0 in stage 0.0 (TID 2, localhost, executor driver, partition 2, PROCESS_LOCAL, 5885 bytes)
17/03/16 10:27:51 INFO TaskSetManager: Starting task 3.0 in stage 0.0 (TID 3, localhost, executor driver, partition 3, PROCESS_LOCAL, 5885 bytes)
17/03/16 10:27:51 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)
17/03/16 10:27:51 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
17/03/16 10:27:51 INFO Executor: Running task 2.0 in stage 0.0 (TID 2)
17/03/16 10:27:51 INFO Executor: Running task 3.0 in stage 0.0 (TID 3)
17/03/16 10:27:51 DEBUG Executor: Task 0's epoch is 0
17/03/16 10:27:51 DEBUG Executor: Task 2's epoch is 0
17/03/16 10:27:51 DEBUG Executor: Task 3's epoch is 0
17/03/16 10:27:51 DEBUG Executor: Task 1's epoch is 0
17/03/16 10:27:51 DEBUG BlockManager: Getting local block broadcast_1
17/03/16 10:27:51 DEBUG BlockManager: Level for block broadcast_1 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:27:51 DEBUG BlockManager: Getting local block broadcast_1
17/03/16 10:27:51 DEBUG BlockManager: Level for block broadcast_1 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:27:51 DEBUG BlockManager: Getting local block broadcast_1
17/03/16 10:27:51 DEBUG BlockManager: Level for block broadcast_1 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:27:51 DEBUG BlockManager: Getting local block broadcast_1
17/03/16 10:27:51 DEBUG BlockManager: Level for block broadcast_1 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:27:51 DEBUG BlockManager: Getting local block broadcast_0
17/03/16 10:27:51 DEBUG BlockManager: Level for block broadcast_0 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:27:51 DEBUG BlockManager: Getting local block broadcast_0
17/03/16 10:27:51 DEBUG BlockManager: Level for block broadcast_0 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:27:51 DEBUG BlockManager: Getting local block broadcast_0
17/03/16 10:27:51 DEBUG BlockManager: Level for block broadcast_0 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:27:51 DEBUG BlockManager: Getting local block broadcast_0
17/03/16 10:27:51 DEBUG BlockManager: Level for block broadcast_0 is StorageLevel(disk, memory, deserialized, 1 replicas)
17/03/16 10:27:51 INFO Executor: Finished task 3.0 in stage 0.0 (TID 3). 908 bytes result sent to driver
17/03/16 10:27:51 INFO Executor: Finished task 2.0 in stage 0.0 (TID 2). 999 bytes result sent to driver
17/03/16 10:27:51 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 912 bytes result sent to driver
17/03/16 10:27:51 INFO Executor: Finished task 1.0 in stage 0.0 (TID 1). 912 bytes result sent to driver
17/03/16 10:27:51 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_0.0, runningTasks: 3
17/03/16 10:27:51 DEBUG TaskSetManager: No tasks for locality level NO_PREF, so moving to locality level ANY
17/03/16 10:27:51 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_0.0, runningTasks: 2
17/03/16 10:27:51 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_0.0, runningTasks: 1
17/03/16 10:27:51 DEBUG TaskSchedulerImpl: parentName: , name: TaskSet_0.0, runningTasks: 0
17/03/16 10:27:51 INFO TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 165 ms on localhost (executor driver) (1/4)
17/03/16 10:27:51 INFO TaskSetManager: Finished task 3.0 in stage 0.0 (TID 3) in 180 ms on localhost (executor driver) (2/4)
17/03/16 10:27:51 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 249 ms on localhost (executor driver) (3/4)
17/03/16 10:27:51 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 186 ms on localhost (executor driver) (4/4)
17/03/16 10:27:51 INFO DAGScheduler: ResultStage 0 (collect at <console>:31) finished in 0.264 s
17/03/16 10:27:51 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
17/03/16 10:27:51 DEBUG DAGScheduler: After removal of stage 0, remaining stages = 0
17/03/16 10:27:51 INFO DAGScheduler: Job 0 finished: collect at <console>:31, took 0.381615 s
res1: Array[Int] = Array(1, 2, 3)
The same is the case with a Broadcast variable.
When you broadcast, the data is cached on all the nodes, so when you perform an action (collect, saveAsTextFile, head) the broadcast values are already available to all the worker nodes.
But if you do not broadcast the value, the data has to be shipped from the driver to the workers along with each task when the action runs.
First off, it is a Spark thing, not a Scala one.
The difference is that plain values are shipped every time they are used, whereas explicit broadcasts are cached on the executors.
"Broadcast variables are created from a variable v by calling
SparkContext.broadcast(v). The broadcast variable is a wrapper around
v, and its value can be accessed by calling the value method ... After the broadcast variable is created, it should
be used instead of the value v in any functions run on the cluster so
that v is not shipped to the nodes more than once"
I want to perform a K-Means task, but training the model fails and I get kicked out of Spark's Scala shell before I get my result metrics. I am not sure if the input format is the problem or something else. I use Spark 1.0.0 and my input text file (400 MB) looks like this:
ID,Category,ProductSize,PurchaseAmount
86252,3711,15.4,4.18
86252,3504,28,1.25
86252,3703,10.75,8.85
86252,3703,10.5,5.55
86252,2201,64,2.79
12262064,7203,32,8.49
etc.
I am not sure if I can use the first two columns, because the MLlib example files only use floats. So I also tried just the last two columns:
16 2.49
64 3.29
56 1
etc.
My error code in both cases is here:
scala> import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.clustering.KMeans
scala> import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vectors
scala>
scala> // Load and parse the data
scala> val data = sc.textFile("data/outkmeanssm.txt")
14/08/07 16:15:37 INFO MemoryStore: ensureFreeSpace(35456) called with curMem=0, maxMem=318111744
14/08/07 16:15:37 INFO MemoryStore: Block broadcast_0 stored as values to memory (estimated size 34.6 KB, free 303.3 MB)
data: org.apache.spark.rdd.RDD[String] = MappedRDD[1] at textFile at <console>:14
scala> val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
parsedData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MappedRDD[2] at map at <console>:16
scala>
scala> // Cluster the data into two classes using KMeans
scala> val numClusters = 2
numClusters: Int = 2
scala> val numIterations = 20
numIterations: Int = 20
scala> val clusters = KMeans.train(parsedData, numClusters, numIterations)
14/08/07 16:15:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
14/08/07 16:15:38 WARN LoadSnappy: Snappy native library not loaded
14/08/07 16:15:38 INFO FileInputFormat: Total input paths to process : 1
14/08/07 16:15:38 INFO SparkContext: Starting job: takeSample at KMeans.scala:260
14/08/07 16:15:38 INFO DAGScheduler: Got job 0 (takeSample at KMeans.scala:260) with 7 output partitions (allowLocal=false)
14/08/07 16:15:38 INFO DAGScheduler: Final stage: Stage 0(takeSample at KMeans.scala:260)
14/08/07 16:15:38 INFO DAGScheduler: Parents of final stage: List()
14/08/07 16:15:38 INFO DAGScheduler: Missing parents: List()
14/08/07 16:15:38 INFO DAGScheduler: Submitting Stage 0 (MappedRDD[6] at map at KMeans.scala:123), which has no missing parents
14/08/07 16:15:39 INFO DAGScheduler: Submitting 7 missing tasks from Stage 0 (MappedRDD[6] at map at KMeans.scala:123)
14/08/07 16:15:39 INFO TaskSchedulerImpl: Adding task set 0.0 with 7 tasks
14/08/07 16:15:39 INFO TaskSetManager: Starting task 0.0:0 as TID 0 on executor localhost: localhost (PROCESS_LOCAL)
14/08/07 16:15:39 INFO TaskSetManager: Serialized task 0.0:0 as 2221 bytes in 3 ms
14/08/07 16:15:39 INFO TaskSetManager: Starting task 0.0:1 as TID 1 on executor localhost: localhost (PROCESS_LOCAL)
14/08/07 16:15:39 INFO TaskSetManager: Serialized task 0.0:1 as 2221 bytes in 0 ms
14/08/07 16:15:39 INFO TaskSetManager: Starting task 0.0:2 as TID 2 on executor localhost: localhost (PROCESS_LOCAL)
14/08/07 16:15:39 INFO TaskSetManager: Serialized task 0.0:2 as 2221 bytes in 0 ms
14/08/07 16:15:39 INFO TaskSetManager: Starting task 0.0:3 as TID 3 on executor localhost: localhost (PROCESS_LOCAL)
14/08/07 16:15:39 INFO TaskSetManager: Serialized task 0.0:3 as 2221 bytes in 1 ms
14/08/07 16:15:39 INFO TaskSetManager: Starting task 0.0:4 as TID 4 on executor localhost: localhost (PROCESS_LOCAL)
14/08/07 16:15:39 INFO TaskSetManager: Serialized task 0.0:4 as 2221 bytes in 0 ms
14/08/07 16:15:39 INFO TaskSetManager: Starting task 0.0:5 as TID 5 on executor localhost: localhost (PROCESS_LOCAL)
14/08/07 16:15:39 INFO TaskSetManager: Serialized task 0.0:5 as 2221 bytes in 0 ms
14/08/07 16:15:39 INFO TaskSetManager: Starting task 0.0:6 as TID 6 on executor localhost: localhost (PROCESS_LOCAL)
14/08/07 16:15:39 INFO TaskSetManager: Serialized task 0.0:6 as 2221 bytes in 0 ms
14/08/07 16:15:39 INFO Executor: Running task ID 4
14/08/07 16:15:39 INFO Executor: Running task ID 1
14/08/07 16:15:39 INFO Executor: Running task ID 5
14/08/07 16:15:39 INFO Executor: Running task ID 6
14/08/07 16:15:39 INFO Executor: Running task ID 0
14/08/07 16:15:39 INFO Executor: Running task ID 3
14/08/07 16:15:39 INFO Executor: Running task ID 2
14/08/07 16:15:39 INFO BlockManager: Found block broadcast_0 locally
14/08/07 16:15:39 INFO BlockManager: Found block broadcast_0 locally
14/08/07 16:15:39 INFO BlockManager: Found block broadcast_0 locally
14/08/07 16:15:39 INFO BlockManager: Found block broadcast_0 locally
14/08/07 16:15:39 INFO BlockManager: Found block broadcast_0 locally
14/08/07 16:15:39 INFO BlockManager: Found block broadcast_0 locally
14/08/07 16:15:39 INFO BlockManager: Found block broadcast_0 locally
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:0+33554432
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:100663296+33554432
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:201326592+24305610
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:33554432+33554432
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:67108864+33554432
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:134217728+33554432
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:167772160+33554432
14/08/07 16:15:39 INFO CacheManager: Partition rdd_3_0 not found, computing it
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:0+33554432
14/08/07 16:15:39 INFO CacheManager: Partition rdd_3_2 not found, computing it
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:67108864+33554432
14/08/07 16:15:39 INFO CacheManager: Partition rdd_3_1 not found, computing it
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:33554432+33554432
14/08/07 16:15:39 INFO CacheManager: Partition rdd_3_4 not found, computing it
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:134217728+33554432
14/08/07 16:15:39 INFO CacheManager: Partition rdd_3_6 not found, computing it
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:201326592+24305610
14/08/07 16:15:39 INFO CacheManager: Partition rdd_3_3 not found, computing it
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:100663296+33554432
14/08/07 16:15:39 INFO CacheManager: Partition rdd_3_5 not found, computing it
14/08/07 16:15:39 INFO HadoopRDD: Input split: file:/Users/admin/BD_Tools/spark-1.0.0/data/outkmeanssm.txt:167772160+33554432
14/08/07 16:16:53 ERROR Executor: Exception in task ID 5
java.lang.OutOfMemoryError: Java heap space
at scala.collection.mutable.ResizableArray$class.ensureSize(ResizableArray.scala:99)
at scala.collection.mutable.ArrayBuffer.ensureSize(ArrayBuffer.scala:47)
at scala.collection.mutable.ArrayBuffer.$plus$eq(ArrayBuffer.scala:83)
at scala.collection.mutable.ArrayBuffer.$plus$eq(ArrayBuffer.scala:47)
at scala.collection.generic.Growable$$anonfun$$plus$plus$eq$1.apply(Growable.scala:48)
at scala.collection.generic.Growable$$anonfun$$plus$plus$eq$1.apply(Growable.scala:48)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:107)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:227)
at org.apache.spark.rdd.ZippedRDD.compute(ZippedRDD.scala:66)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:111)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:695)
14/08/07 16:16:59 ERROR ExecutorUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-5,5,main]
java.lang.OutOfMemoryError: Java heap space
at scala.collection.mutable.ResizableArray$class.ensureSize(ResizableArray.scala:99)
at scala.collection.mutable.ArrayBuffer.ensureSize(ArrayBuffer.scala:47)
at scala.collection.mutable.ArrayBuffer.$plus$eq(ArrayBuffer.scala:83)
at scala.collection.mutable.ArrayBuffer.$plus$eq(ArrayBuffer.scala:47)
at scala.collection.generic.Growable$$anonfun$$plus$plus$eq$1.apply(Growable.scala:48)
at scala.collection.generic.Growable$$anonfun$$plus$plus$eq$1.apply(Growable.scala:48)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:107)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:227)
at org.apache.spark.rdd.ZippedRDD.compute(ZippedRDD.scala:66)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:111)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:695)
14/08/07 16:17:00 WARN TaskSetManager: Lost TID 5 (task 0.0:5)
Chairs-MacBook-Pro:spark-1.0.0 admin$
Chairs-MacBook-Pro:spark-1.0.0 admin$ // Evaluate clustering by computing Within Set Sum of Squared Errors
-bash: //: is a directory
Chairs-MacBook-Pro:spark-1.0.0 admin$ val WSSSE = clusters.computeCost(parsedData)
-bash: syntax error near unexpected token `('
Chairs-MacBook-Pro:spark-1.0.0 admin$ println("Within Set Sum of Squared Errors = " + WSSSE)
What am I missing?
The “java.lang.OutOfMemoryError: Java heap space” error you are facing is triggered when you try to add more data to the heap area in memory than the JVM can accommodate in the Java heap space.
This occurs because applications deployed on the Java Virtual Machine are allowed to use only a limited amount of memory, and this limit is specified at application startup. To make things more complex, Java memory is separated into different regions, one of which is called the heap. And it is the heap you have exhausted.
The first solution should be obvious: when you have run out of a particular resource, you should increase the availability of that resource. In our case, when your application does not have enough Java heap space to run properly, the fix is as easy as altering your JVM launch configuration and adding (or increasing, if already present) the following:
-Xmx1024m
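For a Spark job specifically, the same idea is usually expressed through Spark's own memory settings rather than a raw -Xmx flag. A rough sketch of how that might look (values are illustrative; note that spark.driver.memory normally has to be set before the driver JVM starts, e.g. on the spark-shell or spark-submit command line):
// Hypothetical sketch: give the executors and the driver more heap
val conf = new org.apache.spark.SparkConf()
  .setAppName("KMeansExample")
  .set("spark.executor.memory", "4g")  // heap per executor
  .set("spark.driver.memory", "4g")    // driver heap (only effective if set before the driver JVM starts)
val sc = new org.apache.spark.SparkContext(conf)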
So I have a simple Spark job in which I'm trying to work out how to write bytes into a sequence file. It was working fine, then suddenly the job started hanging, seemingly at the end, in particular at this line:
14/06/06 10:57:48 INFO SparkContext: Job finished: toArray at XXXX.scala:104, took 44.439736728 s
So I had a look at the stderr logs on the workers and I see this:
java.util.concurrent.TimeoutException: Futures timed out after [30 seconds]
at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:219)
at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223)
at scala.concurrent.Await$$anonfun$result$1.apply(package.scala:107)
at scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53)
at scala.concurrent.Await$.result(package.scala:107)
at org.apache.spark.storage.BlockManagerMaster.askDriverWithReply(BlockManagerMaster.scala:162)
at org.apache.spark.storage.BlockManagerMaster.sendHeartBeat(BlockManagerMaster.scala:52)
at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$heartBeat(BlockManager.scala:97)
at org.apache.spark.storage.BlockManager$$anonfun$initialize$1.apply$mcV$sp(BlockManager.scala:135)
at akka.actor.Scheduler$$anon$9.run(Scheduler.scala:80)
at akka.actor.LightArrayRevolverScheduler$$anon$3$$anon$2.run(Scheduler.scala:241)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
The job output has some weird INFO messages I've not seen before:
14/06/06 11:08:28 INFO TaskSetManager: Finished TID 2 in 2163 ms on ip-172-31-23-17.ec2.internal (progress: 0/5)
14/06/06 11:08:28 INFO DAGScheduler: Completed ResultTask(1, 0)
14/06/06 11:08:30 INFO TaskSetManager: Finished TID 3 in 3635 ms on ip-172-31-29-86.ec2.internal (progress: 1/5)
14/06/06 11:08:30 INFO DAGScheduler: Completed ResultTask(1, 1)
^^ Normal output; I see this in jobs all the time. But below there are lots of weird messages.
14/06/06 11:08:50 INFO BlockManagerMasterActor$BlockManagerInfo: Added taskresult_6 in memory on ip-172-31-30-95.ec2.internal:41661 (size: 253.9 MB, free: 2.6 GB)
14/06/06 11:08:50 INFO SendingConnection: Initiating connection to [ip-172-31-30-95.ec2.internal/172.31.30.95:41661]
14/06/06 11:08:50 INFO SendingConnection: Connected to [ip-172-31-30-95.ec2.internal/172.31.30.95:41661], 1 messages pending
14/06/06 11:08:50 INFO ConnectionManager: Accepted connection from [ip-172-31-30-95.ec2.internal/172.31.30.95]
14/06/06 11:08:52 INFO TaskSetManager: Finished TID 6 in 25831 ms on ip-172-31-30-95.ec2.internal (progress: 2/5)
14/06/06 11:08:52 INFO BlockManagerMasterActor$BlockManagerInfo: Removed taskresult_6 on ip-172-31-30-95.ec2.internal:41661 in memory (size: 253.9 MB, free: 2.9 GB)
14/06/06 11:08:53 INFO DAGScheduler: Completed ResultTask(1, 4)
14/06/06 11:08:57 INFO BlockManagerMasterActor$BlockManagerInfo: Added taskresult_4 in memory on ip-172-31-22-58.ec2.internal:46736 (size: 329.3 MB, free: 2.6 GB)
14/06/06 11:08:57 INFO SendingConnection: Initiating connection to [ip-172-31-22-58.ec2.internal/172.31.22.58:46736]
14/06/06 11:08:57 INFO SendingConnection: Connected to [ip-172-31-22-58.ec2.internal/172.31.22.58:46736], 1 messages pending
14/06/06 11:08:57 INFO ConnectionManager: Accepted connection from [ip-172-31-22-58.ec2.internal/172.31.22.58]
14/06/06 11:09:00 INFO TaskSetManager: Finished TID 4 in 33738 ms on ip-172-31-22-58.ec2.internal (progress: 3/5)
14/06/06 11:09:00 INFO BlockManagerMasterActor$BlockManagerInfo: Removed taskresult_4 on ip-172-31-22-58.ec2.internal:46736 in memory (size: 329.3 MB, free: 2.9 GB)
14/06/06 11:09:02 INFO DAGScheduler: Completed ResultTask(1, 2)
If I'm then very patient, eventually the job spits out some more weird stuff:
14/06/06 11:14:15 INFO ConnectionManager: Removing SendingConnection to ConnectionManagerId(ip-172-31-30-95.ec2.internal,41661)
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/9 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO ConnectionManager: Removing ReceivingConnection to ConnectionManagerId(ip-172-31-30-95.ec2.internal,41661)
14/06/06 11:14:15 INFO ConnectionManager: Removing ReceivingConnection to ConnectionManagerId(ip-172-31-28-236.ec2.internal,35129)
14/06/06 11:14:15 INFO ConnectionManager: Key not valid ? sun.nio.ch.SelectionKeyImpl#6b071630
14/06/06 11:14:15 INFO ConnectionManager: Removing SendingConnection to ConnectionManagerId(ip-172-31-28-236.ec2.internal,35129)
14/06/06 11:14:15 ERROR ConnectionManager: Corresponding SendingConnectionManagerId not found
14/06/06 11:14:15 INFO ConnectionManager: Removing SendingConnection to ConnectionManagerId(ip-172-31-22-58.ec2.internal,46736)
14/06/06 11:14:15 INFO ConnectionManager: Removing ReceivingConnection to ConnectionManagerId(ip-172-31-22-58.ec2.internal,46736)
14/06/06 11:14:15 ERROR ConnectionManager: Corresponding SendingConnectionManagerId not found
14/06/06 11:14:15 INFO ConnectionManager: key already cancelled ? sun.nio.ch.SelectionKeyImpl#6b071630
java.nio.channels.CancelledKeyException
at org.apache.spark.network.ConnectionManager.run(ConnectionManager.scala:341)
at org.apache.spark.network.ConnectionManager$$anon$3.run(ConnectionManager.scala:98)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/9 removed: Command exited with code 50
14/06/06 11:14:15 ERROR SendingConnection: Exception while reading SendingConnection to ConnectionManagerId(ip-172-31-28-236.ec2.internal,35129)
java.nio.channels.ClosedChannelException
at sun.nio.ch.SocketChannelImpl.ensureReadOpen(SocketChannelImpl.java:252)
at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:295)
at org.apache.spark.network.SendingConnection.read(Connection.scala:398)
at org.apache.spark.network.ConnectionManager$$anon$5.run(ConnectionManager.scala:158)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
14/06/06 11:14:15 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(1, ip-172-31-30-95.ec2.internal, 41661, 0) with no recent heart beats: 132381ms exceeds 45000ms
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 9 disconnected, so removing it
14/06/06 11:14:15 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(6, ip-172-31-17-30.ec2.internal, 43082, 0) with no recent heart beats: 132382ms exceeds 45000ms
14/06/06 11:14:15 INFO ConnectionManager: Handling connection error on connection to ConnectionManagerId(ip-172-31-28-236.ec2.internal,35129)
14/06/06 11:14:15 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(<driver>, ip-172-31-23-17.ec2.internal, 55101, 0) with no recent heart beats: 132385ms exceeds 45000ms
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost an executor 9 (already removed): Uncaught exception
14/06/06 11:14:15 INFO ConnectionManager: Removing SendingConnection to ConnectionManagerId(ip-172-31-28-236.ec2.internal,35129)
14/06/06 11:14:15 INFO ConnectionManager: Key not valid ? sun.nio.ch.SelectionKeyImpl#3c39a92
14/06/06 11:14:15 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(8, ip-172-31-22-58.ec2.internal, 46736, 0) with no recent heart beats: 132377ms exceeds 45000ms
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor added: app-20140606110822-0000/10 on worker-20140606110717-ip-172-31-21-172.ec2.internal-7078 (ip-172-31-21-172.ec2.internal:7078) with 8 cores
14/06/06 11:14:15 INFO ConnectionManager: key already cancelled ? sun.nio.ch.SelectionKeyImpl#3c39a92
java.nio.channels.CancelledKeyException
at org.apache.spark.network.ConnectionManager.run(ConnectionManager.scala:267)
at org.apache.spark.network.ConnectionManager$$anon$3.run(ConnectionManager.scala:98)
14/06/06 11:14:15 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(9, ip-172-31-21-172.ec2.internal, 42635, 0) with no recent heart beats: 132384ms exceeds 45000ms
14/06/06 11:14:15 INFO ConnectionManager: Key not valid ? sun.nio.ch.SelectionKeyImpl#46000f2b
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606110822-0000/10 on hostPort ip-172-31-21-172.ec2.internal:7078 with 8 cores, 5.0 GB RAM
14/06/06 11:14:15 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(7, ip-172-31-28-236.ec2.internal, 35129, 0) with no recent heart beats: 132379ms exceeds 45000ms
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/10 is now RUNNING
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/4 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/4 removed: Command exited with code 50
14/06/06 11:14:15 INFO ConnectionManager: key already cancelled ? sun.nio.ch.SelectionKeyImpl#46000f2b
java.nio.channels.CancelledKeyException
at org.apache.spark.network.ConnectionManager.run(ConnectionManager.scala:267)
at org.apache.spark.network.ConnectionManager$$anon$3.run(ConnectionManager.scala:98)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 4 disconnected, so removing it
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost executor 4 on ip-172-31-28-73.ec2.internal: Uncaught exception
14/06/06 11:14:15 INFO ConnectionManager: Removing SendingConnection to ConnectionManagerId(ip-172-31-28-236.ec2.internal,35129)
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor added: app-20140606110822-0000/11 on worker-20140606110708-ip-172-31-28-73.ec2.internal-7078 (ip-172-31-28-73.ec2.internal:7078) with 8 cores
14/06/06 11:14:15 INFO DAGScheduler: Executor lost: 4 (epoch 0)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606110822-0000/11 on hostPort ip-172-31-28-73.ec2.internal:7078 with 8 cores, 5.0 GB RAM
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/3 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/3 removed: Command exited with code 50
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 1 disconnected, so removing it
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost executor 1 on ip-172-31-30-95.ec2.internal: remote Akka client disassociated
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 3 disconnected, so removing it
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost an executor 3 (already removed): Uncaught exception
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 7 disconnected, so removing it
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost executor 7 on ip-172-31-28-236.ec2.internal: remote Akka client disassociated
14/06/06 11:14:15 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(2, ip-172-31-23-17.ec2.internal, 44685, 0) with no recent heart beats: 132373ms exceeds 45000ms
14/06/06 11:14:15 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(0, ip-172-31-24-194.ec2.internal, 47896, 0) with no recent heart beats: 132382ms exceeds 45000ms
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 5 disconnected, so removing it
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost executor 5 on ip-172-31-29-86.ec2.internal: remote Akka client disassociated
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor added: app-20140606110822-0000/12 on worker-20140606110708-ip-172-31-26-188.ec2.internal-7078 (ip-172-31-26-188.ec2.internal:7078) with 8 cores
14/06/06 11:14:15 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(5, ip-172-31-29-86.ec2.internal, 48078, 0) with no recent heart beats: 132380ms exceeds 45000ms
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 8 disconnected, so removing it
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost executor 8 on ip-172-31-22-58.ec2.internal: remote Akka client disassociated
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606110822-0000/12 on hostPort ip-172-31-26-188.ec2.internal:7078 with 8 cores, 5.0 GB RAM
14/06/06 11:14:15 INFO BlockManagerMasterActor: Trying to remove executor 4 from BlockManagerMaster.
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/6 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 2 disconnected, so removing it
14/06/06 11:14:15 INFO BlockManagerMaster: Removed 4 successfully in removeExecutor
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/6 removed: Command exited with code 50
14/06/06 11:14:15 INFO DAGScheduler: Executor lost: 1 (epoch 1)
14/06/06 11:14:15 INFO BlockManagerMasterActor: Trying to remove executor 1 from BlockManagerMaster.
14/06/06 11:14:15 INFO BlockManagerMaster: Removed 1 successfully in removeExecutor
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost executor 2 on ip-172-31-23-17.ec2.internal: remote Akka client disassociated
14/06/06 11:14:15 INFO DAGScheduler: Executor lost: 7 (epoch 2)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 0 disconnected, so removing it
14/06/06 11:14:15 INFO BlockManagerMasterActor: Trying to remove executor 7 from BlockManagerMaster.
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost an executor 0 (already removed): remote Akka client disassociated
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor 6 disconnected, so removing it
14/06/06 11:14:15 INFO BlockManagerMaster: Removed 7 successfully in removeExecutor
14/06/06 11:14:15 ERROR TaskSchedulerImpl: Lost an executor 6 (already removed): remote Akka client disassociated
14/06/06 11:14:15 INFO DAGScheduler: Executor lost: 5 (epoch 3)
14/06/06 11:14:15 INFO BlockManagerMasterActor: Trying to remove executor 5 from BlockManagerMaster.
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor added: app-20140606110822-0000/13 on worker-20140606110717-ip-172-31-17-30.ec2.internal-7078 (ip-172-31-17-30.ec2.internal:7078) with 8 cores
14/06/06 11:14:15 INFO BlockManagerMaster: Removed 5 successfully in removeExecutor
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606110822-0000/13 on hostPort ip-172-31-17-30.ec2.internal:7078 with 8 cores, 5.0 GB RAM
14/06/06 11:14:15 INFO DAGScheduler: Executor lost: 8 (epoch 4)
14/06/06 11:14:15 INFO BlockManagerMasterActor: Trying to remove executor 8 from BlockManagerMaster.
14/06/06 11:14:15 INFO BlockManagerMaster: Removed 8 successfully in removeExecutor
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/11 is now RUNNING
14/06/06 11:14:15 INFO DAGScheduler: Executor lost: 2 (epoch 5)
14/06/06 11:14:15 INFO BlockManagerMasterActor: Trying to remove executor 2 from BlockManagerMaster.
14/06/06 11:14:15 INFO BlockManagerMaster: Removed 2 successfully in removeExecutor
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/13 is now RUNNING
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/12 is now RUNNING
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/0 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/0 removed: Command exited with code 50
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor added: app-20140606110822-0000/14 on worker-20140606110706-ip-172-31-24-194.ec2.internal-7078 (ip-172-31-24-194.ec2.internal:7078) with 8 cores
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606110822-0000/14 on hostPort ip-172-31-24-194.ec2.internal:7078 with 8 cores, 5.0 GB RAM
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/14 is now RUNNING
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/5 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/5 removed: Command exited with code 50
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor added: app-20140606110822-0000/15 on worker-20140606110706-ip-172-31-29-86.ec2.internal-7078 (ip-172-31-29-86.ec2.internal:7078) with 8 cores
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606110822-0000/15 on hostPort ip-172-31-29-86.ec2.internal:7078 with 8 cores, 5.0 GB RAM
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/15 is now RUNNING
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/1 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/1 removed: Command exited with code 50
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor added: app-20140606110822-0000/16 on worker-20140606110708-ip-172-31-30-95.ec2.internal-7078 (ip-172-31-30-95.ec2.internal:7078) with 8 cores
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606110822-0000/16 on hostPort ip-172-31-30-95.ec2.internal:7078 with 8 cores, 5.0 GB RAM
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/16 is now RUNNING
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/8 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/8 removed: Command exited with code 50
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor added: app-20140606110822-0000/17 on worker-20140606110708-ip-172-31-22-58.ec2.internal-7078 (ip-172-31-22-58.ec2.internal:7078) with 8 cores
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606110822-0000/17 on hostPort ip-172-31-22-58.ec2.internal:7078 with 8 cores, 5.0 GB RAM
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/17 is now RUNNING
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/7 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/7 removed: Command exited with code 50
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor added: app-20140606110822-0000/18 on worker-20140606110706-ip-172-31-28-236.ec2.internal-7078 (ip-172-31-28-236.ec2.internal:7078) with 8 cores
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Granted executor ID app-20140606110822-0000/18 on hostPort ip-172-31-28-236.ec2.internal:7078 with 8 cores, 5.0 GB RAM
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/18 is now RUNNING
14/06/06 11:14:15 INFO AppClient$ClientActor: Executor updated: app-20140606110822-0000/2 is now FAILED (Command exited with code 50)
14/06/06 11:14:15 INFO SparkDeploySchedulerBackend: Executor app-20140606110822-0000/2 removed: Command exited with code 50
14/06/06 11:14:15 ERROR AppClient$ClientActor: Master removed our application: FAILED; stopping client
14/06/06 11:14:15 WARN SparkDeploySchedulerBackend: Disconnected from Spark cluster! Waiting for reconnection...
Then it just hangs again ... and again, if I'm patient, it spits out the following and hangs once more:
14/06/06 11:14:15 WARN SparkDeploySchedulerBackend: Disconnected from Spark cluster! Waiting for reconnection...
14/06/06 11:16:54 WARN BlockManagerMasterActor: Removing BlockManager BlockManagerId(3, ip-172-31-26-188.ec2.internal, 55392, 0) with no recent heart beats: 159686ms exceeds 45000ms
14/06/06 11:19:42 WARN BlockManagerMaster: Error sending message to BlockManagerMaster in 1 attempts
java.util.concurrent.TimeoutException: Futures timed out after [30 seconds]
at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:219)
at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223)
at scala.concurrent.Await$$anonfun$result$1.apply(package.scala:107)
at scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53)
at scala.concurrent.Await$.result(package.scala:107)
at org.apache.spark.storage.BlockManagerMaster.askDriverWithReply(BlockManagerMaster.scala:162)
at org.apache.spark.storage.BlockManagerMaster.sendHeartBeat(BlockManagerMaster.scala:52)
at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$heartBeat(BlockManager.scala:97)
at org.apache.spark.storage.BlockManager$$anonfun$initialize$1.apply$mcV$sp(BlockManager.scala:135)
at akka.actor.Scheduler$$anon$9.run(Scheduler.scala:80)
at akka.actor.LightArrayRevolverScheduler$$anon$3$$anon$2.run(Scheduler.scala:241)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
After 10 mins, my patience runs out and I kill -9 it (normal interrupt doesn't work).
The question is: how do I get my cluster back to the state where it worked? It seems Spark is holding some state somewhere that we can't zap. We have tried deleting the Spark cache files, i.e. .../spark/spark-*, and we have tried restarting all the workers and the master!
UPDATE:
I think the problem could be that the file I thought I was reading got corrupted in a way that made it grow to about 370 MB. The toArray on such a large amount of data may have caused things to go crazy. After just deleting the file and trying again on other files, things returned to normal. Nevertheless, I'm leaving the question open, as the behaviour isn't what one would expect: one would simply expect a long wait, followed possibly by an OOM.
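For context, the change I'm considering is to write the sequence file directly from the executors instead of pulling everything back to the driver with toArray first. A rough sketch under that assumption (the byte data and output path are purely illustrative):
import org.apache.spark.SparkContext._                     // implicits for saveAsSequenceFile
import org.apache.hadoop.io.{BytesWritable, NullWritable}

// Hypothetical sketch: no driver-side toArray, the workers write the file themselves
val bytesRdd = sc.parallelize(Seq(Array[Byte](1, 2, 3), Array[Byte](4, 5, 6)))
bytesRdd
  .map(b => (NullWritable.get(), new BytesWritable(b)))
  .saveAsSequenceFile("/tmp/bytes-seqfile")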