Spark: Draw learning curve of a model with spark - scala

I am using Spark and I would like to train a machine learning model.
Because of bad results, I would like to display the error made by the model at each epoch of the training (on train and test dataset).
I will then use this information to determine whether my model is underfitting or overfitting the data.
Question: How can I draw the learning curve of a model with spark ?
In the following example, I have implemented my own evaluator and overridden the evaluate method to print the metrics I needed, but only two values were displayed (maxIter = 1000).
MinimalRunnableCode.scala:
import org.apache.spark.SparkConf
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.SparkSession
/**
 * Minimal runnable example: tunes a LinearRegression with a TrainValidationSplit,
 * scoring every parameter set with CustomRegressionEvaluator (which prints the
 * metric each time it is asked to evaluate).
 */
object Min extends App {
  // Open spark session.
  val conf = new SparkConf()
    .setMaster("local")
    .set("spark.network.timeout", "800")
  val ss = SparkSession.builder
    .config(conf)
    .getOrCreate

  // Load data: a tiny in-memory set of (features, label) rows.
  val data = ss.createDataFrame(ss.sparkContext.parallelize(
      List(
        (Vectors.dense(1, 2), 1),
        (Vectors.dense(1, 3), 2),
        (Vectors.dense(1, 2), 1),
        (Vectors.dense(1, 3), 2),
        (Vectors.dense(1, 2), 1),
        (Vectors.dense(1, 3), 2),
        (Vectors.dense(1, 2), 1),
        (Vectors.dense(1, 3), 2),
        (Vectors.dense(1, 2), 1),
        (Vectors.dense(1, 3), 2),
        (Vectors.dense(1, 4), 3)
      )
    ))
    .withColumnRenamed("_1", "features")
    .withColumnRenamed("_2", "label")

  // Only `training` is fitted below; `test` is held out and unused in this snippet.
  val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 42)

  // Create model of linear regression.
  val lr = new LinearRegression().setMaxIter(1000)

  // Create parameters grid that will be used to train different versions of the linear model.
  // `addGrid(lr.fitIntercept)` expands the boolean param to both true and false,
  // so the grid contains two parameter sets.
  val paramGrid = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(0.001))
    .addGrid(lr.fitIntercept)
    .addGrid(lr.elasticNetParam, Array(0.5))
    .build()

  // Create trainer using validation split to evaluate which set of parameters performs the best.
  val trainValidationSplit = new TrainValidationSplit()
    .setEstimator(lr)
    .setEvaluator(new CustomRegressionEvaluator)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8) // 80% of the data will be used for training and the remaining 20% for validation.

  // Run train validation split, and choose the best set of parameters.
  // The evaluator is invoked once per parameter set (on the validation split),
  // not once per training iteration.
  val model = trainValidationSplit.fit(training)

  // Close spark session.
  ss.stop()
}
CustomRegressionEvaluator.scala:
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
/**
 * Regression evaluator that computes one metric from the prediction and label
 * columns and prints it, together with a call counter, on every `evaluate` call.
 *
 * The metric is currently fixed to mean absolute error ("mae").
 *
 * @param uid unique identifier of this evaluator instance
 */
final class CustomRegressionEvaluator(override val uid: String)
    extends Evaluator with HasPredictionCol with HasLabelCol with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("regEval"))

  /** Name of the metric computed by `evaluate`. */
  private val metricName: String = "mae"

  /**
   * Requires that column `colName` of `schema` has a numeric type.
   *
   * @param msg optional text appended to the failure message
   * @throws IllegalArgumentException if the column is not of a NumericType
   */
  def checkNumericType(
      schema: StructType,
      colName: String,
      msg: String = ""): Unit = {
    val actualDataType = schema(colName).dataType
    val message = if (msg != null && msg.trim.length > 0) " " + msg else ""
    require(actualDataType.isInstanceOf[NumericType], s"Column $colName must be of type " +
      s"NumericType but was actually of type $actualDataType.$message")
  }

  /**
   * Requires that the type of column `colName` equals one of `dataTypes`.
   *
   * @param msg optional text appended to the failure message
   * @throws IllegalArgumentException if the column type matches none of `dataTypes`
   */
  def checkColumnTypes(
      schema: StructType,
      colName: String,
      dataTypes: Seq[DataType],
      msg: String = ""): Unit = {
    val actualDataType = schema(colName).dataType
    val message = if (msg != null && msg.trim.length > 0) " " + msg else ""
    require(dataTypes.exists(actualDataType.equals),
      s"Column $colName must be of type equal to one of the following types: " +
      s"${dataTypes.mkString("[", ", ", "]")} but was actually of type $actualDataType.$message")
  }

  var i = 0 // count the number of time the evaluate method is called

  /**
   * Computes the metric over `dataset`, prints "<call index> <metric>" and returns the metric.
   *
   * @param dataset data containing the prediction and label columns
   * @return the value of the configured metric
   */
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = metricName match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    println(s"$i $metric") // Print the metrics
    i = i + 1 // Update counter
    metric
  }

  /**
   * Error metrics (rmse, mse, mae) must be minimised; only r2 is "larger is better".
   * Without this override the Evaluator default (true) makes model selection
   * pick the parameter set with the largest error.
   */
  override def isLargerBetter: Boolean = metricName == "r2"

  // Must return this class: `defaultCopy` casts the copy to the declared result type,
  // so declaring Spark's RegressionEvaluator here would fail with a ClassCastException.
  override def copy(extra: ParamMap): CustomRegressionEvaluator = defaultCopy(extra)
}
// Loader object for persisted evaluators, typed on RegressionEvaluator.
// NOTE(review): this object is named and parameterised for Spark's RegressionEvaluator,
// not for CustomRegressionEvaluator defined above; it looks like a leftover from
// copying Spark's source. Confirm whether it should load CustomRegressionEvaluator instead.
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {
// Delegates to the inherited DefaultParamsReadable.load.
override def load(path: String): RegressionEvaluator = super.load(path)
}
// Mixin that gives a Params class a `predictionCol` string parameter (default "prediction").
// NOTE(review): `private[ml]` only compiles inside a package named `ml`
// (e.g. org.apache.spark.ml); confirm the package this file is meant to live in.
private[ml] trait HasPredictionCol extends Params {
/**
* Param for prediction column name.
* #group param
*/
final val predictionCol: Param[String] = new Param[String](this, "predictionCol", "prediction column name")
// Default column name used when the parameter is not set explicitly.
setDefault(predictionCol, "prediction")
/** Returns the current value of `predictionCol`. #group getParam */
final def getPredictionCol: String = $(predictionCol)
}
// Mixin that gives a Params class a `labelCol` string parameter (default "label").
// NOTE(review): `private[ml]` only compiles inside a package named `ml`
// (e.g. org.apache.spark.ml); confirm the package this file is meant to live in.
private[ml] trait HasLabelCol extends Params {
/**
* Param for label column name.
* #group param
*/
final val labelCol: Param[String] = new Param[String](this, "labelCol", "label column name")
// Default column name used when the parameter is not set explicitly.
setDefault(labelCol, "label")
/** Returns the current value of `labelCol`. #group getParam */
final def getLabelCol: String = $(labelCol)
}

Here is a possible solution for the specific case of LinearRegression, and for any other algorithm that supports an objective history (in this case, LinearRegressionTrainingSummary does the job).
Let's first create a minimal verifiable and complete example :
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.util.{LinearDataGenerator, MLUtils}
import org.apache.spark.sql.SparkSession
// Obtain (or reuse) the active session; its implicits provide `.toDF` on RDDs below.
val spark: SparkSession = SparkSession.builder().getOrCreate()
import org.apache.spark.ml.evaluation.RegressionEvaluator
import spark.implicits._

// Synthetic linear-regression data: generated as an RDD, turned into a DataFrame,
// then the "features" column is converted to the spark.ml vector type.
val data = MLUtils.convertVectorColumnsToML(
  LinearDataGenerator
    .generateLinearRDD(spark.sparkContext, nexamples = 10000, nfeatures = 4, eps = 0.05)
    .toDF,
  "features"
)
As you've noticed, when you want to generate data for testing purposes for spark-mllib or spark-ml, it's advised to use data generators.
Now, let's train a linear regressor :
// Create model of linear regression.
val lr = new LinearRegression().setMaxIter(1000)

// The following grid yields two sets of parameters:
// `addGrid(lr.fitIntercept)` expands the boolean param to both true and false.
val paramGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.001))
  .addGrid(lr.fitIntercept)
  .addGrid(lr.elasticNetParam, Array(0.5))
  .build()

// Create trainer using validation split to evaluate which set of parameters performs the best.
// I'm using the regular RegressionEvaluator here.
// To retrieve subModels, collectSubModels must be set to true before fitting.
val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(lr)
  .setEvaluator(new RegressionEvaluator)
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.8) // 80% of the data will be used for training and the remaining 20% for validation.
  .setCollectSubModels(true)

// Run train validation split, and choose the best set of parameters.
// The fitted model is never reassigned, so it is a `val`.
val model = trainValidationSplit.fit(data)
Now since our model is trained, all we need is to get the objective history.
The following part needs a bit of gymnastics between the model and sub-models object parameters.
In case you have a Pipeline or so, this code needs to be modified, so use it carefully. It's just an example :
// One row per trained sub-model: its grid parameters plus the objective value
// recorded at each step of training (keys are 1-based positions in the history).
val objectiveHist = spark.sparkContext.parallelize(
  model.subModels.zip(model.getEstimatorParamMaps).map {
    case (m: LinearRegressionModel, pm: ParamMap) =>
      val history: Array[Double] = m.summary.objectiveHistory

      // Number every entry of the history. Zipping with `1 until history.length`
      // would silently drop the last recorded objective value.
      val indexedHistory: Map[Int, Double] =
        history.zipWithIndex.map { case (objective, i) => (i + 1) -> objective }.toMap

      // Look parameters up by name: the iteration order of a ParamMap is not
      // guaranteed, so positional extraction could mislabel the columns.
      val byName: Map[String, String] =
        pm.toSeq.map(pair => pair.param.name -> pair.value.toString).toMap

      (byName("regParam"), byName("elasticNetParam"), byName("fitIntercept"), indexedHistory)
  }).toDF("regParam", "elasticNetParam", "fitIntercept", "objectiveHistory")
We can now examine those metrics :
objectiveHist.show(false)
// +--------+---------------+------------+-------------------------------------------------------------------------------------------------------+
// |regParam|elasticNetParam|fitIntercept|objectiveHistory |
// +--------+---------------+------------+-------------------------------------------------------------------------------------------------------+
// |0.001 |0.5 |true |[1 -> 0.4999999999999999, 2 -> 0.4038796441909531, 3 -> 0.02659222058006269, 4 -> 0.026592220340980147]|
// |0.001 |0.5 |false |[1 -> 0.5000637621421942, 2 -> 0.4039303922115196, 3 -> 0.026592220673025396, 4 -> 0.02659222039347222]|
// +--------+---------------+------------+-------------------------------------------------------------------------------------------------------+
You can notice that the training process actually stops after 4 iterations.
If you want just the number of iterations, you can do the following instead :
// One row per trained sub-model: its grid parameters plus the number of
// entries in its objective history.
val objectiveHist2 = spark.sparkContext.parallelize(
  model.subModels.zip(model.getEstimatorParamMaps).map {
    case (m: LinearRegressionModel, pm: ParamMap) =>
      val history: Array[Double] = m.summary.objectiveHistory

      // Look parameters up by name: the iteration order of a ParamMap is not
      // guaranteed, so positional extraction could mislabel the columns.
      val byName: Map[String, String] =
        pm.toSeq.map(pair => pair.param.name -> pair.value.toString).toMap

      (byName("regParam"), byName("elasticNetParam"), byName("fitIntercept"), history.size)
  }).toDF("regParam", "elasticNetParam", "fitIntercept", "iterations")
I've changed the number of features in the generator (nfeatures = 100) for the sake of demonstrations :
objectiveHist2.show
// +--------+---------------+------------+----------+
// |regParam|elasticNetParam|fitIntercept|iterations|
// +--------+---------------+------------+----------+
// | 0.001| 0.5| true| 11|
// | 0.001| 0.5| false| 11|
// +--------+---------------+------------+----------+

Related

Spark ML insert/fit custom OneHotEncoder into a Pipeline

Say I have a few features/columns in a dataframe on which I apply the regular OneHotEncoder, and one column (say, the n-th) on which I need to apply my custom OneHotEncoder. Then I need to use VectorAssembler to assemble those features and put everything into a Pipeline, finally fitting on my trainData and getting predictions from my testData, such as:
val sIndexer1 = new StringIndexer().setInputCol("my_feature1").setOutputCol("indexed_feature1")
// ... let, n-1 such sIndexers for n-1 features
val featureEncoder = new OneHotEncoderEstimator().setInputCols(Array(sIndexer1.getOutputCol), ...).
setOutputCols(Array("encoded_feature1", ... ))
// **need to insert output from my custom OneHotEncoder function (please see below)**
// (which takes the n-th feature as input) in a way that matches the VectorAssembler below
val vectorAssembler = new VectorAssembler().setInputCols(featureEncoder.getOutputCols + ???).
setOutputCol("assembled_features")
...
val pipeline = new Pipeline().setStages(Array(sIndexer1, ...,featureEncoder, vectorAssembler, myClassifier))
val model = pipeline.fit(trainData)
val predictions = model.transform(testData)
How can I modify the building of the vectorAssembler so that it can ingest the output from the custom OneHotEncoder?
The problem is my desired oheEncodingTopN() cannot/should not refer to the "actual" dataframe, since it would be a part of the pipeline (to apply on trainData/testData).
Note:
I tested that the custom OneHotEncoder (see link) works just as expected separately on e.g. trainData. Basically, oheEncodingTopN applies OneHotEncoding on the input column, but for the top N frequent values only (e.g. N = 50), and put all the rest infrequent values in a dummy column (say, "default"), e.g.:
val oheEncoded = oheEncodingTopN(df, "my_featureN", 50)
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, lit, when}
import org.apache.spark.sql.Column
/** Inverts a 0/1 indicator column: 1 becomes 0, anything else (including null) becomes 1. */
def flip(col: Column): Column = {
  val isOne = col === 1
  when(isOne, lit(0)).otherwise(lit(1))
}
/**
 * One-hot encodes `colName`, keeping one indicator column for each of the `n`
 * most frequent values plus a "default" indicator for every other value.
 * The original `colName` column is dropped from the result.
 *
 * Uses the DataFrame API only, so it neither registers a temp view named "data"
 * (which would overwrite any existing view of that name) nor depends on a
 * `spark` session being in scope.
 *
 * @param df      input data
 * @param colName column to encode
 * @param n       number of most frequent values that get their own column
 * @return `df` without `colName`, with the indicator columns appended
 */
def oheEncodingTopN(df: DataFrame, colName: String, n: Int): DataFrame = {
  // The n most frequent values of the column.
  val topNDF = df
    .groupBy(colName)
    .count()
    .orderBy(col("count").desc)
    .limit(n)

  // One row per frequent value with a 1 in its own pivoted column;
  // "default" is 1 here and is flipped after the join.
  val pivotTopNDF = topNDF
    .groupBy(colName)
    .pivot(colName)
    .count()
    .withColumn("default", lit(1))

  // Rows holding an infrequent value find no match, so all their indicators are null.
  val joinedTopNDF = df.join(pivotTopNDF, Seq(colName), "left").drop(colName)

  // Nulls become 0, then "default" is inverted: 1 for unmatched rows, 0 for matched ones.
  joinedTopNDF
    .na.fill(0, joinedTopNDF.columns)
    .withColumn("default", flip(col("default")))
}
I think the cleanest way would be to create your own class that extends spark ML Transformer so that you can play with as you would do with any other transformer (like OneHotEncoder). Your class would look like this :
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, Column}
/**
 * Transformer wrapper around the "top-N one-hot encoding" logic: one indicator
 * column per each of the `n` most frequent values of the input column plus a
 * "default" indicator for all other values.
 *
 * NOTE(review): `transformSchema` announces a single `outputCol` field, while
 * `transform` currently emits one column per frequent value and never creates
 * `outputCol`; the two still have to be aligned (e.g. by assembling the
 * indicators into a single vector column).
 *
 * @param n   number of most frequent values that get their own indicator
 * @param uid unique identifier of this transformer instance
 */
class OHEncodingTopN(n: Int, override val uid: String) extends Transformer {
  final val inputCol = new Param[String](this, "inputCol", "The input column")
  final val outputCol = new Param[String](this, "outputCol", "The output column")

  def setInputCol(value: String): this.type = set(inputCol, value)
  def setOutputCol(value: String): this.type = set(outputCol, value)

  def this(n: Int) = this(n, Identifiable.randomUID("OHEncodingTopN"))

  // `defaultCopy` instantiates the class reflectively through a constructor that
  // takes only the uid; this class has none, so the copy is built explicitly.
  def copy(extra: ParamMap): OHEncodingTopN =
    copyValues(new OHEncodingTopN(n, uid), extra)

  override def transformSchema(schema: StructType): StructType = {
    // Check that the input type is what you want if needed
    // val idx = schema.fieldIndex($(inputCol))
    // val field = schema.fields(idx)
    // if (field.dataType != StringType) {
    //   throw new Exception(s"Input type ${field.dataType} did not match input type StringType")
    // }
    // Add the return field
    schema.add(StructField($(outputCol), IntegerType, false))
  }

  /** Inverts a 0/1 indicator column: 1 becomes 0, anything else (including null) becomes 1. */
  def flip(col: Column): Column = when(col === 1, lit(0)).otherwise(lit(1))

  /**
   * Encodes the input column of `df`; the input column itself is dropped.
   * Implemented with the DataFrame API so that no temp view named "data" is
   * registered as a side effect.
   */
  def transform(df: Dataset[_]): DataFrame = {
    val colName = $(inputCol)

    // The n most frequent values of the column.
    val topNDF = df
      .groupBy(colName)
      .count()
      .orderBy(col("count").desc)
      .limit(n)

    // One row per frequent value with a 1 in its own pivoted column.
    val pivotTopNDF = topNDF
      .groupBy(colName)
      .pivot(colName)
      .count()
      .withColumn("default", lit(1))

    // Rows holding an infrequent value find no match, so all their indicators are null.
    val joinedTopNDF = df.join(pivotTopNDF, Seq(colName), "left").drop(colName)

    // Nulls become 0, then "default" is inverted: 1 for unmatched rows, 0 for matched ones.
    joinedTopNDF
      .na.fill(0, joinedTopNDF.columns)
      .withColumn("default", flip(col("default")))
  }
}
Now on an OHEncodingTopN object you should be able to call .getOutputCol to perform what you want. Good luck.
EDIT: your method that I just copy pasted in the transform method should be slightly modified in order to output a column of type Vector having the name given in the setOutputCol.

Dataset data is updated after inserting into Mysql Database

I have a small scenario where i read text file and calculate average based on date and store the summary into Mysql database.
Following is code
val repo_sum = joined_data.map(SensorReport.generateReport)
repo_sum.show() --- STEP 1
repo_sum.write.mode(SaveMode.Overwrite).jdbc(url, "sensor_report", prop)
repo_sum.show() --- STEP 2
After calculating average in repo_sum dataframe following is the result of STEP 1
+----------+------------------+-----+-----+
| date| flo| hz|count|
+----------+------------------+-----+-----+
|2017-10-05|52.887049194476745|10.27| 5.0|
|2017-10-04| 55.4188048943416|10.27| 5.0|
|2017-10-03| 54.1529270444092|10.27| 10.0|
+----------+------------------+-----+-----+
Then the save command is executed and the dataset values at step 2 is
+----------+-----------------+------------------+-----+
| date| flo| hz|count|
+----------+-----------------+------------------+-----+
|2017-10-05|52.88704919447673|31.578524597238367| 10.0|
|2017-10-04| 55.4188048943416| 32.84440244717079| 10.0|
+----------+-----------------+------------------+-----+
Following is complete code
class StreamRead extends Serializable {
org.apache.spark.sql.catalyst.encoders.OuterScopes.addOuterScope(this);
/**
 * Streams sensor readings from a directory, aggregates each micro-batch per date,
 * merges the aggregate with the report stored in MySQL and writes the merged
 * report back to the same table.
 */
def main(args: Array[String]): Unit = {
  val conf = new SparkConf().setAppName("Application").setMaster("local[2]")
  val ssc = new StreamingContext(conf, Seconds(2))
  val sqlContext = new SQLContext(ssc.sparkContext)
  import sqlContext.implicits._

  val sensorDStream = ssc.textFileStream("file:///C:/Users/M1026352/Desktop/Spark/StreamData").map(Sensor.parseSensor)
  val url = "jdbc:mysql://localhost:3306/streamdata"
  val prop = new java.util.Properties
  prop.setProperty("user", "root")
  prop.setProperty("password", "root")

  sensorDStream.foreachRDD { rdd =>
    if (!rdd.isEmpty()) {
      // Report currently stored in the database (read lazily).
      val databaseVal = sqlContext.read.jdbc(url, "sensor_report", prop)

      // Per-date sums and reading count for this batch.
      val repo_data = rdd.groupBy(_.date).map { case (date, readings) =>
        val sum_flo = readings.map(_.flo).reduce(_ + _)
        val sum_hz = readings.map(_.hz).reduce(_ + _)
        val sum_flo_count = readings.size
        print(sum_flo_count)
        SensorReport(date, sum_flo, sum_hz, sum_flo_count)
      }
      val df = repo_data.toDF()

      val joined_data = df.join(databaseVal, Seq("date"), "fullouter")
      joined_data.show()

      // `repo_sum` is derived from the very table that is overwritten below.
      // Datasets are lazy, so every action would otherwise re-read the table:
      // the write would read a table it is replacing, and the show() after the
      // write would merge the batch with the freshly written values a second time.
      // Cache and force full materialisation before the table is touched.
      val repo_sum = joined_data.map(SensorReport.generateReport).cache()
      repo_sum.count()

      repo_sum.show()
      repo_sum.write.mode(SaveMode.Overwrite).jdbc(url, "sensor_report", prop)
      repo_sum.show()

      // Release the cached batch once it has been written and displayed.
      repo_sum.unpersist()
    }
  }

  ssc.start()
  WorkerAndTaskExample.main(args)
  ssc.awaitTermination()
}
/** One sensor reading as parsed from a comma-separated text line. */
case class Sensor(resid: String, date: String, time: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double)

object Sensor extends Serializable {
  /**
   * Parses a comma-separated line into a Sensor: the first three fields are kept
   * as text, fields 3 to 8 are converted to Double.
   * Throws if the line has fewer than nine fields or a numeric field is malformed.
   */
  def parseSensor(str: String): Sensor = {
    val fields = str.split(",")
    def number(position: Int): Double = fields(position).toDouble
    Sensor(
      resid = fields(0),
      date = fields(1),
      time = fields(2),
      hz = number(3),
      disp = number(4),
      flo = number(5),
      sedPPM = number(6),
      psi = number(7),
      chlPPM = number(8)
    )
  }
}
/** Per-date report: average flo, average hz and the number of readings behind them. */
case class SensorReport(date: String, flo: Double, hz: Double, count: Double)

object SensorReport extends Serializable {
  /**
   * Merges one row of the full outer join "batch aggregate x stored report" into a report.
   *
   * Columns are read by position:
   * 0 = date, 1 = batch flo sum, 2 = batch hz sum, 3 = batch count,
   * 4 = stored flo average, 5 = stored hz average, 6 = stored count.
   *
   * @param row joined row; the batch side (1-3) or the stored side (4-6) may be null
   * @return the updated report for the row's date
   */
  def generateReport(row: Row): SensorReport = {
    print(row)
    if (row.isNullAt(4)) {
      // Date exists only in the batch: averages are sum / count.
      SensorReport(row.getString(0), row.getDouble(1) / row.getDouble(3), row.getDouble(2) / row.getDouble(3), row.getDouble(3))
    } else if (row.isNullAt(2)) {
      // Date exists only in the database: keep the stored values.
      SensorReport(row.getString(0), row.getDouble(4), row.getDouble(5), row.getDouble(6))
    } else {
      // Date exists on both sides: combine the stored average (weighted by its
      // count) with the batch sum.
      val storedCount = row.getDouble(6)
      val count = row.getDouble(3) + storedCount
      val flow_avg_update = (storedCount * row.getDouble(4) + row.getDouble(1)) / count
      // The hz average must use the batch hz sum (column 2); using column 1
      // (the flo sum) here corrupted the stored hz values.
      val flow_flo_update = (storedCount * row.getDouble(5) + row.getDouble(2)) / count
      print(count + " : " + flow_avg_update + " : " + flow_flo_update)
      SensorReport(row.getString(0), flow_avg_update, flow_flo_update, count)
    }
  }
}
As far as I understand, when the save command is executed in Spark the whole process runs again. Is my understanding correct? Please let me know.
In Spark all transformations are lazy, nothing will happen until an action is called. At the same time, this means that if multiple actions are called on the same RDD or dataframe, all computations will be performed multiple times. This includes loading the data and all transformations.
To avoid this, use cache() or persist() (the same thing, except that persist() lets you specify a storage level, while cache() always uses the default one: memory only for RDDs, memory and disk for dataframes/datasets). cache() will keep the RDD/dataframe in memory after the first time an action is used on it, hence avoiding running the same transformations multiple times.
In this case, the unexpected behavior is caused by performing two actions on the same dataframe; caching the dataframe would solve the problem:
val repo_sum = joined_data.map(SensorReport.generateReport).cache()

spark map partitions to fill nan values

I want to fill nan values in spark using the last good known observation - see: Spark / Scala: fill nan with last good observation
My current solution used window functions in order to accomplish the task. But this is not great, as all values are mapped into a single partition.
val imputed: RDD[FooBar] = recordsDF.rdd.mapPartitionsWithIndex { case (i, iter) => fill(i, iter) } should work a lot better. But strangely my fill function is not executed. What is wrong with my code?
+----------+--------------------+
| foo| bar|
+----------+--------------------+
|2016-01-01| first|
|2016-01-02| second|
| null| noValidFormat|
|2016-01-04|lastAssumingSameDate|
+----------+--------------------+
Here is the full example code:
import java.sql.Date
import org.apache.log4j.{ Level, Logger }
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
// Record type of the example. `foo` is null for rows whose date string could not
// be cast to a Date (see the "2016-wrongFormat" row in the example data).
case class FooBar(foo: Date, bar: String)
/**
 * Fills missing `foo` values with the last good observation, partition by partition.
 *
 * First pass: collect the last non-missing row of every partition.
 * Second pass: walk each partition, seeding the "last good row" with the value
 * carried over from the nearest preceding partition that has one.
 *
 * NOTE(review): this assumes partition index order reflects record order — confirm
 * for real inputs (e.g. sort / range-partition by the ordering key first).
 */
object WindowFunctionExample extends App {
  Logger.getLogger("org").setLevel(Level.WARN)

  val conf: SparkConf = new SparkConf()
    .setAppName("foo")
    .setMaster("local[*]")
  val spark: SparkSession = SparkSession
    .builder()
    .config(conf)
    .enableHiveSupport()
    .getOrCreate()
  import spark.implicits._

  val myDff = Seq(("2016-01-01", "first"), ("2016-01-02", "second"),
    ("2016-wrongFormat", "noValidFormat"),
    ("2016-01-04", "lastAssumingSameDate"))
  val recordsDF = myDff
    .toDF("foo", "bar")
    .withColumn("foo", 'foo.cast("Date")) // unparsable strings become null
    .as[FooBar]
  recordsDF.show

  /** True when the row carries a usable `foo` value. */
  def notMissing(row: FooBar): Boolean = {
    row.foo != null
  }

  // Last good row of every partition; None for partitions that are empty or
  // contain only missing values (with local[*] many partitions are empty).
  val toCarry = recordsDF.rdd.mapPartitionsWithIndex { case (i, iter) =>
    Iterator((i, iter.filter(notMissing(_)).toSeq.lastOption))
  }.collectAsMap
  println("###################### carry ")
  println(toCarry)
  toCarry.foreach(println)
  println("###################### carry ")
  val toCarryBd = spark.sparkContext.broadcast(toCarry)

  /**
   * Forward-fills missing rows of partition `i`.
   * Rows that precede any good observation are left unchanged.
   */
  def fill(i: Int, iter: Iterator[FooBar]): Iterator[FooBar] = {
    // Seed with the last good row of the closest earlier partition that has one.
    // Never call `.get` on the carried Option: it is None for many partitions.
    var lastNotNullRow: Option[FooBar] =
      (0 until i).reverse.flatMap(p => toCarryBd.value.getOrElse(p, None)).headOption

    iter.map { row =>
      if (notMissing(row)) {
        lastNotNullRow = Some(row)
        row
      } else {
        lastNotNullRow.fold(row)(good => FooBar(good.foo, row.bar))
      }
    }
  }

  // mapPartitionsWithIndex is lazy: `fill` only runs once an action
  // (collect / show below) is executed on the result.
  val imputed: RDD[FooBar] = recordsDF.rdd.mapPartitionsWithIndex { case (i, iter) => fill(i, iter) }
  val imputedDF = imputed.toDS()
  println(imputedDF.orderBy($"foo").collect.toList)
  imputedDF.show
  spark.stop()
}
edit
I fixed the code as outlined by the comment. But the toCarryBd contains None values. How can this happen as I did filter explicitly for
def notMissing(row: FooBar): Boolean = {row.foo != null}
iter.filter(notMissing(_)).toSeq.lastOption
non None values.
(2,None)
(5,None)
(4,None)
(7,Some(FooBar(2016-01-04,lastAssumingSameDate)))
(1,Some(FooBar(2016-01-01,first)))
(3,Some(FooBar(2016-01-02,second)))
(6,None)
(0,None)
This leads to NoSuchElementException: None.get when trying to access toCarryBd.
Firstly, if your foo field can be null, I would recommend creating the case class as:
case class FooBar(foo: Option[Date], bar: String)
Then, you can rewrite your notMissing function to something like:
/** True when a row is present and its `foo` value is defined. */
def notMissing(row: Option[FooBar]): Boolean = row.exists(_.foo.isDefined)

Spark Streaming 2.0.1 - Datasets and RDDs

I am a beginner with Spark Streaming and I am trying out the streaming linear regression example using Scala. When I searched, I found lots of examples of streaming machine learning algorithms using RDDs. But isn't it possible to use Datasets (introduced in Spark 2.0.1) for streaming instead of RDDs? Is there any way to verify whether the code is using RDDs or Datasets? I have posted my code below. Any help appreciated.
import scala.language.reflectiveCalls
import scopt.OptionParser
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.{ DataFrame, SparkSession }
import com.sun.xml.internal.ws.wsdl.writer.document.Import
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.lang.Boolean
/**
 * Command-line example that trains a LinearRegression (elastic-net) model on a
 * DataFrame and prints training time, coefficients and evaluation results.
 */
object LinearRegressionExample {

  /** Command-line options of the example, with their defaults. */
  case class Params(
      input: String = null,
      testInput: String = "",
      dataFormat: String = "libsvm",
      regParam: Double = 0.0,
      elasticNetParam: Double = 0.0,
      maxIter: Int = 100,
      tol: Double = 1E-6,
      fracTest: Double = 0.2) extends AbstractParams[Params]

  /** Parses `args` into Params and runs the example; exits with status 1 on invalid arguments. */
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LinearRegressionExample") {
      head("LinearRegressionExample: an example Linear Regression with Elastic-Net app.")
      opt[Double]("regParam")
        .text(s"regularization parameter, default: ${defaultParams.regParam}")
        .action((x, c) => c.copy(regParam = x))
      opt[Double]("elasticNetParam")
        .text(s"ElasticNet mixing parameter. For alpha = 0, the penalty is an L2 penalty. " +
          s"For alpha = 1, it is an L1 penalty. For 0 < alpha < 1, the penalty is a combination of " +
          s"L1 and L2, default: ${defaultParams.elasticNetParam}")
        .action((x, c) => c.copy(elasticNetParam = x))
      opt[Int]("maxIter")
        .text(s"maximum number of iterations, default: ${defaultParams.maxIter}")
        .action((x, c) => c.copy(maxIter = x))
      opt[Double]("tol")
        .text(s"the convergence tolerance of iterations, Smaller value will lead " +
          s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
        .action((x, c) => c.copy(tol = x))
      opt[Double]("fracTest")
        .text(s"fraction of data to hold out for testing. If given option testInput, " +
          s"this option is ignored. default: ${defaultParams.fracTest}")
        .action((x, c) => c.copy(fracTest = x))
      opt[String]("testInput")
        .text(s"input path to test dataset. If given, option fracTest is ignored." +
          s" default: ${defaultParams.testInput}")
        .action((x, c) => c.copy(testInput = x))
      opt[String]("dataFormat")
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        .action((x, c) => c.copy(dataFormat = x))
      arg[String]("<input>")
        .text("input path to labeled examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  /**
   * Loads the data sets, fits the model and prints the results.
   *
   * @param params parsed command-line options
   */
  def run(params: Params): Unit = {
    // The `s` interpolator is required; without it the app name would contain
    // the literal text "$params".
    val conf = new SparkConf().setMaster("local[2]").setAppName(s"LinearRegressionExample with $params")
    // NOTE(review): the streaming context is created but no stream is defined on it
    // and it is never started; everything below is batch DataFrame processing.
    val ssc = new StreamingContext(conf, Seconds(1))
    val spark = SparkSession
      .builder
      .appName(s"LinearRegressionExample with $params")
      .getOrCreate()

    println(s"LinearRegressionExample with parameters:\n$params")

    // Load training and test data and cache it.
    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
      params.dataFormat, params.testInput, "regression", params.fracTest)

    val lir = new LinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setRegParam(params.regParam)
      .setElasticNetParam(params.elasticNetParam)
      .setMaxIter(params.maxIter)
      .setTol(params.tol)

    // Train the model
    val startTime = System.nanoTime()
    val lirModel = lir.fit(training)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    // Print the weights and intercept for linear regression.
    println(s"Weights: ${lirModel.coefficients} Intercept: ${lirModel.intercept}")

    println("Training data results:")
    DecisionTreeExample.evaluateRegressionModel(lirModel, training, "label")
    println("Test data results:")
    DecisionTreeExample.evaluateRegressionModel(lirModel, test, "label")

    spark.stop()
  }
}

SparkContext cannot be launched in the same programe with Streaming SparkContext

I created the following test that fit a simple linear regression model to a dummy streaming data.
I use hyper-parameters optimisation to find good values of stepSize, numiterations and initialWeights of the linear model.
Everything runs fine, except the last lines of the code that are commented out:
// Save the evaluations for further visualization
// val gridEvalsRDD = sc.parallelize(gridEvals)
// gridEvalsRDD.coalesce(1)
// .map(e => "%.3f\t%.3f\t%d\t%.3f".format(e._1, e._2, e._3, e._4))
// .saveAsTextFile("data/mllib/streaming")
The problem is with the SparkContext sc. If I initialize it at the beginning of the test, the program fails with errors. It looks like sc should be defined in some special way in order to avoid conflicts with ssc (the streaming context). Any ideas?
The whole code:
// scalastyle:off
package org.apache.spark.mllib.regression
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
import org.apache.spark.streaming.TestSuiteBase
import org.scalatest.BeforeAndAfter
/**
 * Test suite that fits a streaming linear regression to generated data, first
 * with fixed settings and then over a small grid of hyper-parameters, printing
 * the error obtained for every configuration.
 */
class StreamingLinearRegressionHypeOpt extends TestSuiteBase with BeforeAndAfter {

  // use longer wait time to ensure job completion
  override def maxWaitTimeMillis: Int = 20000

  var ssc: StreamingContext = _

  override def afterFunction(): Unit = {
    super.afterFunction()
    if (ssc != null) {
      ssc.stop()
    }
  }

  /**
   * Error measure used by this suite: for each batch, the squared sum of
   * absolute (first - second) differences; batch values are summed and divided by `n`.
   *
   * NOTE(review): this squares the per-batch sum of absolute errors rather than
   * summing squared errors, so it is not the textbook MSE — confirm this is intended.
   *
   * @param output one sequence of value pairs per batch
   * @param n      divisor applied to the summed batch errors
   */
  def calculateMSE(output: Seq[Seq[(Double, Double)]], n: Int): Double =
    output.map { seqOfPairs =>
      val err = seqOfPairs.map(p => math.abs(p._1 - p._2)).sum
      err * err
    }.sum / n

  /** Square root of [[calculateMSE]] for the same arguments. */
  def calculateRMSE(output: Seq[Seq[(Double, Double)]], n: Int): Double =
    math.sqrt(calculateMSE(output, n))

  /** Splits every string of the stream on single spaces and flattens the result. */
  def dummyStringStreamSplit(datastream: Stream[String]): Stream[String] =
    datastream.flatMap(txt => txt.split(" "))

  test("Test 1") {
    // create model initialized with zero weights
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(0.0, 0.0))
      .setStepSize(0.2)
      .setNumIterations(25)

    // generate sequence of simulated data for testing
    val numBatches = 10
    val nPoints = 100
    val inputData = (0 until numBatches).map { i =>
      LinearDataGenerator.generateLinearInput(0.0, Array(10.0, 10.0), nPoints, 42 * (i + 1))
    }

    // Without hyper-parameters optimization
    withStreamingContext(setupStreams(inputData, (inputDStream: DStream[LabeledPoint]) => {
      model.trainOn(inputDStream)
      model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
    })) { context =>
      val output: Seq[Seq[(Double, Double)]] = runStreams(context, numBatches, numBatches)
      val rmse = calculateRMSE(output, nPoints)
      println(s"RMSE = $rmse")
    }

    // With hyper-parameters optimization.
    // Each grid is kept in its own typed list so that no `asInstanceOf` casts
    // are needed when configuring the model.
    val initialWeightsGrid: List[Vector] = List(Vectors.dense(0.0, 0.0), Vectors.dense(10.0, 10.0))
    val stepSizeGrid: List[Double] = List(0.1, 0.2, 0.3)
    val numIterationsGrid: List[Int] = List(25, 50)

    val gridEvals: List[(Vector, Double, Int, Double)] = for {
      initialWeights <- initialWeightsGrid
      stepSize <- stepSizeGrid
      numIterations <- numIterationsGrid
    } yield {
      val lr = new StreamingLinearRegressionWithSGD()
        .setInitialWeights(initialWeights)
        .setStepSize(stepSize)
        .setNumIterations(numIterations)

      withStreamingContext(setupStreams(inputData, (inputDStream: DStream[LabeledPoint]) => {
        lr.trainOn(inputDStream)
        lr.predictOnValues(inputDStream.map(x => (x.label, x.features)))
      })) { context =>
        val output: Seq[Seq[(Double, Double)]] = runStreams(context, numBatches, numBatches)
        val cvRMSE = calculateRMSE(output, nPoints)
        println(s"RMSE = $cvRMSE")
        (initialWeights, stepSize, numIterations, cvRMSE)
      }
    }

    // Save the evaluations for further visualization
    // val gridEvalsRDD = sc.parallelize(gridEvals)
    // gridEvalsRDD.coalesce(1)
    // .map(e => "%.3f\t%.3f\t%d\t%.3f".format(e._1, e._2, e._3, e._4))
    // .saveAsTextFile("data/mllib/streaming")
  }
}
// scalastyle:on