optimizing shuffle spark outer join dataset - scala

I am using Spark 2.1 with the DataFrames API to do the following:
import java.security.MessageDigest
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Encoders, SaveMode, SparkSession}
import spark.implicits._
case class C(id_1: String, id_2: String, a: Option[Int], b: String)
val schema = Encoders.product[C]
val data1 = Seq(
("d1_r0", "d1_t0", 1, "yyy"),
("d1_r1", "d1_t1", 2, "xxx"),
("d2_r2", "d2_t2", 3, "ppp"),
("d1_r3", "d1_t3", 4, "iii")
)
val df1 = data1.toDF("id_1", "id_2", "a", "b")
val ds1: Dataset[C] = df1.as(schema)
val data2 = Seq(
("d2_r0", "d2_t0", 1, "lll"),
("d1_r1", "d1_t1", 2, "mmm"),
("d2_r2", "d2_t2", 3, "ppp"),
("d2_r3", "d2_t3", 4, "nnn")
)
val df2 = data2.toDF("id_1", "id_2", "a", "b")
val ds2: Dataset[C] = df2.as(schema)
def getMD5Hash(x: C): String = {
val str = (x.id_1 + x.id_2 + x.a + x.b)
val msgDigest: MessageDigest = MessageDigest.getInstance("MD5")
val MD5Hash = msgDigest
.digest(str.getBytes())
.map(0xff & _)
.map { "%02x".format(_) }
.foldLeft("") { _ + _ }
MD5Hash
}
def u(newV: C, oldV: C): Seq[C] = {
Seq(C(oldV.id_1, oldV.id_2, oldV.a, newV.b))
}
def uOrI(b: String)(row: (C, C)): Seq[C] = {
row match {
case (newV, null) => Seq(newV)
case (null, oldV) => Seq(C(oldV.id_1, oldV.id_2, oldV.a, b))
case (newV, oldV) => {
if (getMD5Hash(newV) == getMD5Hash(oldV)) Seq(oldV)
else u(newV, oldV)
}
}
}
val df3 = ds1
.joinWith(
ds2,
$"_1.id_1" === $"_2.id_1" && $"_1.id_2" === $"_2.id_2","full_outer"
).flatMap(uOrI("jjjjjjjj"))
The program works and produces what I expected, but on the real dataset (over 1 million rows each for df1 and df2) the solution is very slow: about 30 minutes to complete on a YARN cluster with 10 nodes (16 CPUs and 128 GB of RAM each).
Is there another solution/idea that would optimize the shuffle and the running time?
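One direction that may help (a sketch only, not benchmarked against your data): keep the whole comparison in the untyped DataFrame/Column world, so Spark never has to deserialize every row into a C object, and compute the hash with the built-in md5/concat_ws functions instead of a per-row MessageDigest. The code below reuses C, ds1 and ds2 from above; the "jjjjjjjj" literal stands in for the b parameter of uOrI.
import org.apache.spark.sql.functions._

// "n" = new data (ds1), "o" = old data (ds2); "h" is a per-row hash of all fields
def withHash(ds: Dataset[C]) =
  ds.toDF.withColumn("h", md5(concat_ws("|", col("id_1"), col("id_2"), col("a"), col("b"))))

val joined = withHash(ds1).as("n")
  .join(withHash(ds2).as("o"),
    col("n.id_1") === col("o.id_1") && col("n.id_2") === col("o.id_2"),
    "full_outer")

val df3 = joined.select(
    coalesce(col("o.id_1"), col("n.id_1")).as("id_1"),
    coalesce(col("o.id_2"), col("n.id_2")).as("id_2"),
    coalesce(col("o.a"), col("n.a")).as("a"),
    when(col("o.id_1").isNull, col("n.b"))          // only in ds1: insert as-is
      .when(col("n.id_1").isNull, lit("jjjjjjjj"))  // only in ds2: overwrite b
      .when(col("n.h") === col("o.h"), col("o.b"))  // identical row: keep old b
      .otherwise(col("n.b"))                        // changed row: take new b
      .as("b")
  ).as[C]
With the logic expressed as column operations, Catalyst can plan a single sort-merge join with codegen; the remaining cost is the unavoidable shuffle of both inputs by (id_1, id_2), which you can tune with spark.sql.shuffle.partitions.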

Related

Update to the delta table in spark not working

package jobs
import io.delta.tables.DeltaTable
import model.RandomUtils
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }
import org.apache.spark.sql.{ DataFrame, Dataset, Encoder, Encoders, SparkSession }
import jobs.SystemJob.Rate
import org.apache.spark.sql.functions._
import org.apache.spark.sql._
case class Student(firstName: String, lastName: String, age: Long, percentage: Long)
case class Rate(timestamp: java.sql.Timestamp, value: Long)
case class College(name: String, address: String, principal: String)
object RCConfigDSCCDeltaLake {
def getSpark(): SparkSession = {
SparkSession.builder
.appName("Delta table demo")
.master("local[*]")
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
.getOrCreate()
}
def main(args: Array[String]): Unit = {
val spark = getSpark()
val rate = 1
val studentProfile = "student_profile"
if (!DeltaTable.isDeltaTable(s"spark-warehouse/$studentProfile")) {
val deltaTable: DataFrame = spark.sql(s"CREATE TABLE `$studentProfile` (firstName String, lastName String, age Long, percentage Long) USING delta")
deltaTable.show()
deltaTable.printSchema()
}
val studentProfileDT = DeltaTable.forPath(spark, s"spark-warehouse/$studentProfile")
def processStream(student: Dataset[Student], college: Dataset[College]) = {
val studentQuery = student.writeStream.outputMode(OutputMode.Update()).foreachBatch {
(st: Dataset[Student], y: Long) =>
val listOfStudents = st.collect().toList
println("list of students :::" + listOfStudents)
val (o, n) = ("oldData", "newData")
val colMap = Map(
"firstName" -> col(s"$n.firstName"),
"lastName" -> col(s"$n.lastName"),
"age" -> col(s"$n.age"),
"percentage" -> col(s"$n.percentage"))
studentProfileDT.as(s"$o").merge(st.toDF.as(s"$n"), s"$o.firstName = $n.firstName AND $o.lastName = $n.lastName")
.whenMatched.update(colMap)
.whenNotMatched.insert(colMap)
.execute()
}.start()
val os = spark.readStream.format("delta").load(s"spark-warehouse/$studentProfile").writeStream.format("console")
.outputMode(OutputMode.Append())
.option("truncate", value = false)
.option("checkpointLocation", "retrieved").start()
studentQuery.awaitTermination()
os.awaitTermination()
}
import spark.implicits._
implicit val encStudent: Encoder[Student] = Encoders.product[Student]
implicit val encCollege: Encoder[College] = Encoders.product[College]
def rateStream = spark
.readStream
.format("rate") // <-- use RateStreamSource
.option("rowsPerSecond", rate)
.load()
.as[Rate]
val studentStream: Dataset[Student] = rateStream.filter(_.value % 25 == 0).map {
stu =>
Student(...., ....., ....., .....) //fill with values
}
val collegeStream: Dataset[College] = rateStream.filter(_.value % 40 == 0).map {
stu =>
College(...., ....., ......) //fill with values
}
processStream(studentStream, collegeStream)
}
}
What I am trying to do is a simple UPSERT operation with streaming datasets, but it fails with the following error:
22/04/13 19:50:33 ERROR MicroBatchExecution: Query [id = 8cf759fd-9bee-460f-
b0d9-91889c59c524, runId = 55723708-fd3c-4425-a2bc-83d737c37589] terminated with
error
java.lang.UnsupportedOperationException: Detected a data update (for example part-
00000-d026d92e-1798-4d21-a505-67ec72d334e2-c000.snappy.parquet) in the source table
at version 4. This is currently not supported. If you'd like to ignore updates, set
the option 'ignoreChanges' to 'true'. If you would like the data update to be
reflected, please restart this query with a fresh checkpoint directory.
Dependencies :
--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2
--packages io.delta:delta-core_2.12:0.7.0
The update query works when the datasets are not streamed and the data is hardcoded.
Am I doing something wrong here?
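For what it's worth, the error message itself suggests one workaround: set ignoreChanges on the delta source so that the files rewritten by the MERGE are not rejected (they are re-emitted instead, so the console sink may see duplicates), and restart with a fresh checkpoint directory. A sketch of the console reader from above with that option added:
val os = spark.readStream
  .format("delta")
  .option("ignoreChanges", "true")   // accept files rewritten in place by MERGE
  .load(s"spark-warehouse/$studentProfile")
  .writeStream
  .format("console")
  .outputMode(OutputMode.Append())
  .option("truncate", value = false)
  .option("checkpointLocation", "retrieved")  // point at a new directory if the old checkpoint already saw the update
  .start()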

Spark collect_list and limit resulting list

I have a dataframe of the following format:
name merged
key1 (internalKey1, value1)
key1 (internalKey2, value2)
...
key2 (internalKey3, value3)
...
What I want to do is group the dataframe by the name, collect the list and limit the size of the list.
This is how I group by the name and collect the list:
val res = df.groupBy("name")
.agg(collect_list(col("merged")).as("final"))
The resulting dataframe is something like:
key1 [(internalKey1, value1), (internalKey2, value2),...] // Limit the size of this list
key2 [(internalKey3, value3),...]
What I want to do is limit the size of the produced list for each key. I've tried multiple ways to do that but had no success. I've already seen some posts that suggest third-party solutions, but I want to avoid that. Is there a way?
So while a UDF does what you need, if you're looking for a more performant way that is also memory sensitive, the way to do this would be to write a UDAF. Unfortunately the UDAF API is not as extensible as the aggregate functions that ship with Spark, but you can use the internal APIs to build on the internal functions and do what you need.
Here is an implementation of collect_list_limit that is mostly a copy-paste of Spark's internal CollectList AggregateFunction. I would just extend it, but it's a case class. Really all that's needed is to override the update and merge methods to respect a passed-in limit:
// Spark-internal classes this relies on (packages may differ slightly between versions)
import scala.collection.mutable
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.aggregate.{Collect, ImperativeAggregate}
case class CollectListLimit(
child: Expression,
limitExp: Expression,
mutableAggBufferOffset: Int = 0,
inputAggBufferOffset: Int = 0) extends Collect[mutable.ArrayBuffer[Any]] {
val limit = limitExp.eval( null ).asInstanceOf[Int]
def this(child: Expression, limit: Expression) = this(child, limit, 0, 0)
override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate =
copy(mutableAggBufferOffset = newMutableAggBufferOffset)
override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate =
copy(inputAggBufferOffset = newInputAggBufferOffset)
override def createAggregationBuffer(): mutable.ArrayBuffer[Any] = mutable.ArrayBuffer.empty
override def update(buffer: mutable.ArrayBuffer[Any], input: InternalRow): mutable.ArrayBuffer[Any] = {
if( buffer.size < limit ) super.update(buffer, input)
else buffer
}
override def merge(buffer: mutable.ArrayBuffer[Any], other: mutable.ArrayBuffer[Any]): mutable.ArrayBuffer[Any] = {
if( buffer.size >= limit ) buffer
else if( other.size >= limit ) other
else ( buffer ++= other ).take( limit )
}
override def prettyName: String = "collect_list_limit"
}
And to actually register it, we can go through Spark's internal FunctionRegistry, which takes the name and a builder, i.e. a function that creates a CollectListLimit from the provided expressions:
val collectListBuilder = (args: Seq[Expression]) => CollectListLimit( args( 0 ), args( 1 ) )
FunctionRegistry.builtin.registerFunction( "collect_list_limit", collectListBuilder )
Edit:
It turns out that adding it to the builtin registry only works if you haven't created the SparkContext yet, because an immutable clone is made on startup. If you already have an existing context, then this should add it via reflection:
val field = classOf[SessionCatalog].getFields.find( _.getName.endsWith( "functionRegistry" ) ).get
field.setAccessible( true )
val inUseRegistry = field.get( SparkSession.builder.getOrCreate.sessionState.catalog ).asInstanceOf[FunctionRegistry]
inUseRegistry.registerFunction( "collect_list_limit", collectListBuilder )
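Once registered this way the function is only reachable through SQL or expr (it is not part of the typed functions API). A hedged usage sketch against the question's dataframe:
import org.apache.spark.sql.functions.expr

df.groupBy("name")
  .agg(expr("collect_list_limit(merged, 2)").as("final"))
  .show(false)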
You can create a function that limits the size of the aggregated ArrayType column as shown below:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Column
case class KV(k: String, v: String)
val df = Seq(
("key1", KV("internalKey1", "value1")),
("key1", KV("internalKey2", "value2")),
("key2", KV("internalKey3", "value3")),
("key2", KV("internalKey4", "value4")),
("key2", KV("internalKey5", "value5"))
).toDF("name", "merged")
def limitSize(n: Int, arrCol: Column): Column =
array( (0 until n).map( arrCol.getItem ): _* )
df.
groupBy("name").agg( collect_list(col("merged")).as("final") ).
select( $"name", limitSize(2, $"final").as("final2") ).
show(false)
// +----+----------------------------------------------+
// |name|final2 |
// +----+----------------------------------------------+
// |key1|[[internalKey1,value1], [internalKey2,value2]]|
// |key2|[[internalKey3,value3], [internalKey4,value4]]|
// +----+----------------------------------------------+
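As a side note (assuming Spark 2.4+, which the question does not state), the built-in slice function does the truncation directly and, unlike the getItem approach above, does not pad shorter lists with nulls:
df.
  groupBy("name").agg( slice(collect_list(col("merged")), 1, 2).as("final2") ).
  show(false)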
You can use a UDF.
Here is a possible example without the need for a hand-written schema, and with a meaningful reduction:
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.functions._
import scala.collection.mutable
object TestJob1 {
def main (args: Array[String]): Unit = {
val sparkSession = SparkSession
.builder()
.appName(this.getClass.getName.replace("$", ""))
.master("local")
.getOrCreate()
val sc = sparkSession.sparkContext
import sparkSession.sqlContext.implicits._
val rawDf = Seq(
("key", 1L, "gargamel"),
("key", 4L, "pe_gadol"),
("key", 2L, "zaam"),
("key1", 5L, "naval")
).toDF("group", "quality", "other")
rawDf.show(false)
rawDf.printSchema
val rawSchema = rawDf.schema
val fUdf = udf(reduceByQuality, rawSchema)
val aggDf = rawDf
.groupBy("group")
.agg(
count(struct("*")).as("num_reads"),
max(col("quality")).as("quality"),
collect_list(struct("*")).as("horizontal")
)
.withColumn("short", fUdf($"horizontal"))
.drop("horizontal")
aggDf.printSchema
aggDf.show(false)
}
def reduceByQuality= (x: Any) => {
val d = x.asInstanceOf[mutable.WrappedArray[GenericRowWithSchema]]
val red = d.reduce((r1, r2) => {
val quality1 = r1.getAs[Long]("quality")
val quality2 = r2.getAs[Long]("quality")
val r3 = quality1 match {
case a if a >= quality2 =>
r1
case _ =>
r2
}
r3
})
red
}
}
Here is an example with data like yours:
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import scala.collection.mutable
object TestJob {
def main (args: Array[String]): Unit = {
val sparkSession = SparkSession
.builder()
.appName(this.getClass.getName.replace("$", ""))
.master("local")
.getOrCreate()
val sc = sparkSession.sparkContext
import sparkSession.sqlContext.implicits._
val df1 = Seq(
("key1", ("internalKey1", "value1")),
("key1", ("internalKey2", "value2")),
("key2", ("internalKey3", "value3")),
("key2", ("internalKey4", "value4")),
("key2", ("internalKey5", "value5"))
)
.toDF("name", "merged")
// df1.printSchema
//
// df1.show(false)
val res = df1
.groupBy("name")
.agg( collect_list(col("merged")).as("final") )
res.printSchema
res.show(false)
def f= (x: Any) => {
val d = x.asInstanceOf[mutable.WrappedArray[GenericRowWithSchema]]
val d1 = d.asInstanceOf[mutable.WrappedArray[GenericRowWithSchema]].head
d1.toString
}
val fUdf = udf(f, StringType)
val d2 = res
.withColumn("d", fUdf(col("final")))
.drop("final")
d2.printSchema()
d2
.show(false)
}
}
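A hedged adaptation of the same UDF pattern to the original ask (keep the first n collected rows rather than just the head); the return type reuses the ArrayType already carried by the final column:
val takeN = udf(
  (x: Any) => x.asInstanceOf[mutable.WrappedArray[GenericRowWithSchema]].take(2),
  res.schema("final").dataType)   // ArrayType(StructType(...)), unchanged

res
  .withColumn("final2", takeN(col("final")))
  .drop("final")
  .show(false)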

Generating recommendation model for a large dataset using spark

I'm trying to generate a simple ALS model using the Spark documentation here.
My first file (ratings.csv) has 20 million rows of UserID, MovID, Rat and can be downloaded here
So I have the testing data, which is a subset of ratings.csv. That test dataset can be downloaded here:
The test file has just the UserID and Movie ID columns.
So to create the training data we will have to filter ratings.csv.
The following code works fine for a smaller case of 100,000 UserID, MovID ratings. I am not able to generate the model for the big case.
Please help with a pointer.
/**
* Created by echoesofconc on 3/8/17.
*/
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import java.io._
import scala.collection.mutable.ListBuffer
object Prateek_Agrawal_task1 {
def dropheader(data: RDD[String]): RDD[String] = {
data.mapPartitionsWithIndex((idx, lines) => {
if (idx == 0) {
lines.drop(1)
}
lines
})
}
def create_training(ratings_split: RDD[Array[String]], ratings_testing: Array[Array[String]]) = {
ratings_split.filter(x => {
ratings_testing.exists(y =>
(x(0) == y(0) && x(1) == y(1))
) == false
})
}
def create_testing(ratings_split: RDD[Array[String]], ratings_testing: Array[Array[String]]) = {
ratings_split.filter(x => {
ratings_testing.exists(y =>
(x(0) == y(0) && x(1) == y(1))
) == true
})
}
def create_model(ratings_train:RDD[Array[String]],rank:Int,numIterations:Int ):org.apache.spark.mllib.recommendation.MatrixFactorizationModel={
val ratings = ratings_train.map(_ match { case Array(user,item,rate,temp) =>
Rating(user.toInt, item.toInt, rate.toDouble)
})
val model = ALS.train(ratings, rank, numIterations, 0.01)
return model
}
def print_results(final_predictions_adjusted:RDD[((Int, Int), Double)])={
val rating_range=final_predictions_adjusted.map(x=>(x._2.toInt,1)).reduceByKey(_+_).sortByKey()
val rating_range_till_4=rating_range.map{x=>
var temp=x
if (x._1==5){temp=(4,x._2)}
temp
}.reduceByKey(_+_)
rating_range_till_4.sortByKey().foreach { x =>
if(x._1==0)
printf(">=0 and <1: " + x._2+"\n")
if(x._1==1)
printf(">=1 and <2: " + x._2+"\n")
if(x._1==2)
printf(">=2 and <3: " + x._2+"\n")
if(x._1==3)
printf(">=3 and <4: " + x._2+"\n")
if(x._1==4)
printf(">=4 " + x._2+"\n")
if(x._1==5)
printf("=5 " + x._2+"\n")
}
}
case class User_mov_rat(UserID: Int, MovieID:Int, Pred_rating: Double)
def print_outputfile(final_predictions_adjusted:RDD[((Int, Int), Double)])={
val writer = new FileWriter(new File("./output.txt" ))
writer.write("UserID,MovieID,Pred_rating\n")
final_predictions_adjusted.collect().foreach(x=>{writer.write(x._1._1+","+x._1._2+","+x._2+"\n")})
writer.close()
}
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Prateek_Agrawal_task1").setMaster("local[2]")
val sc = new SparkContext(conf)
val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-20m/ratings.csv"
val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_20m.csv"
val data = sc.textFile(file, 2).cache()
val data_test = sc.textFile(test, 2).cache()
// Drop Header
val data_wo_header=dropheader(data).persist()
val data_test_wo_header=dropheader(data_test).persist()
// Create Training and testing data of the format (User ID, MovID, Rating, Time)
val ratings_split = data_wo_header.map(line => line.split(",")).persist()
data_wo_header.unpersist()
data.unpersist()
val ratings_testing = data_test_wo_header.map(line => line.split(",")).collect()
data_test_wo_header.unpersist()
data_test.unpersist()
val ratings_train = create_training(ratings_split, ratings_testing).persist()
val ratings_test=create_testing(ratings_split, ratings_testing)
ratings_split.unpersist()
ratings_test.unpersist()
// Create the model using rating_train the training data
val rank = 1
val numIterations = 10
val model=create_model(ratings_train,rank,numIterations)
ratings_train.unpersist()
// Average user rating from training; this is for cases which are in test but not rated by any user in training
val user_avgrat=ratings_test.map(_ match { case Array(user, mov, rate, temp) =>(user.toInt, (rate.toDouble,1.0))}).reduceByKey((x,y)=>(x._1 + y._1, x._2 + y._2)).mapValues{ case (sum, count) => (1.0 * sum) / count }
// Predict user_mov ratings
val user_mov = data_test_wo_header.map(_.split(',') match { case Array(user, mov) =>
(user.toInt,mov.toInt)
})
val predictions =
model.predict(user_mov).map { case Rating(user, mov, rate) =>
((user, mov), rate)
}
// Combine predictions with the unpredicted (user, movie) pairs for which the model produced nothing. Going forward we need to improve the accuracy of these predictions
val user_mov_rat=user_mov.map(x=>(x,0.0))
val predictions_unpredicted_combined= predictions.union(user_mov_rat).reduceByKey(_+_).map(x=>(x._1._1,(x._1._2,x._2)))
// Combine average rating and predictions+unpredicted values
val avg_rating_predictions_unpredicted_combined=predictions_unpredicted_combined.join(user_avgrat)
// Generate final predictions RDD
val final_predictions=avg_rating_predictions_unpredicted_combined.map{x=>
var temp=((x._1,x._2._1._1),x._2._2)
if(x._2._1._2==0.0){temp=((x._1,x._2._1._1),x._2._2)}
if(x._2._1._2!=0.0){temp=((x._1,x._2._1._1),x._2._1._2)}
temp
}
// Adjust for ratings above 5.0 and below 0.0
val final_predictions_adjusted=final_predictions.map{x=>
var temp=x
if (x._2>5.0){temp=(x._1,5.0)}
if (x._2<0.0){temp=(x._1,0.0)}
temp
}
val ratesAndPreds = ratings_test.map(_ match { case Array(user, mov, rate, temp) => ((user.toInt,mov.toInt),rate.toDouble)}).join(final_predictions_adjusted)
val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
val err = (r1 - r2)
err * err
}.mean()
val RMSE=math.sqrt(MSE)
// Print output.txt
print_outputfile(final_predictions_adjusted)
// Print the prediction results
print_results(final_predictions_adjusted.sortByKey())
print(RMSE+"\n")
}
}
In case someone thinks I should be doing a regex match, I have tried that approach, but it doesn't seem to be the bottleneck.
I only need to complete the create-model part, which is where I am stuck for the big dataset. Can somebody help?
EDIT:
Another approach I tried, which should be much faster, uses broadcast variables. But it has been running for 12 hours with no sign of progress. On the Spark UI, somehow the whole RDD (ratings.csv, ~500 MB) is not cached; only around 64 MB with 2.5 million lines is processed initially. I am using --executor-memory -8g. I have modified the create_training and create_testing functions:
/**
* Created by echoesofconc on 3/8/17.
*/
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
import java.io._
object Prateek_Agrawal_task2 {
def dropheader(data: RDD[String]): RDD[String] = {
data.mapPartitionsWithIndex((idx, lines) => {
if (idx == 0) {
lines.drop(1)
}
lines
})
}
def create_training(data_wo_header: RDD[String], data_test_wo_header: RDD[String],sc:SparkContext): RDD[String] = {
val rdd2array = sc.broadcast(data_test_wo_header.collect())
val training_set = data_wo_header.filter{
case(x) => rdd2array.value.filter(y => x.indexOf(y.toString())==0).length == 0
}
return training_set
}
def create_test(data_wo_header: RDD[String], data_test_wo_header: RDD[String],sc:SparkContext): RDD[String] = {
val rdd2array = sc.broadcast(data_test_wo_header.collect())
val training_set = data_wo_header.filter{
case(x) => rdd2array.value.filter(y => x.indexOf(y.toString())==0).length != 0
}
return training_set
}
def create_model(ratings_train:RDD[String],rank:Int,numIterations:Int ):org.apache.spark.mllib.recommendation.MatrixFactorizationModel={
val ratings = ratings_train.map(_.split(',') match { case Array(user, item, rate, timestamp) =>
Rating(user.toInt, item.toInt, rate.toDouble)
})
val model = ALS.train(ratings, rank, numIterations, 0.01)
return model
}
def print_results(final_predictions_adjusted:RDD[((Int, Int), Double)])={
val rating_range=final_predictions_adjusted.map(x=>(x._2.toInt,1)).reduceByKey(_+_).sortByKey()
val rating_range_till_4=rating_range.map{x=>
var temp=x
if (x._1==5){temp=(4,x._2)}
temp
}.reduceByKey(_+_)
rating_range_till_4.sortByKey().foreach { x =>
if(x._1==0)
printf(">=0 and <1: " + x._2+"\n")
if(x._1==1)
printf(">=1 and <2: " + x._2+"\n")
if(x._1==2)
printf(">=2 and <3: " + x._2+"\n")
if(x._1==3)
printf(">=3 and <4: " + x._2+"\n")
if(x._1==4)
printf(">=4 " + x._2+"\n")
if(x._1==5)
printf("=5 " + x._2+"\n")
}
}
case class User_mov_rat(UserID: Int, MovieID:Int, Pred_rating: Double)
def print_outputfile(final_predictions_adjusted:RDD[((Int, Int), Double)])={
val writer = new FileWriter(new File("./output.txt" ))
writer.write("UserID,MovieID,Pred_rating\n")
final_predictions_adjusted.collect().foreach(x=>{writer.write(x._1._1+","+x._1._2+","+x._2+"\n")})
writer.close()
}
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Prateek_Agrawal_task1").setMaster("local[2]")
val sc = new SparkContext(conf)
val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-latest-small/ratings.csv"
val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_small.csv"
// val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-20m/ratings.csv"
// val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_20m.csv"
val data = sc.textFile(file, 2).persist()
val data_test = sc.textFile(test, 2).persist()
// Drop Header
val data_wo_header=dropheader(data)
val data_test_wo_header=dropheader(data_test)
// Create training and testing data of the format (User ID, MovID, Rating, Time)
val ratings_train=create_training(data_wo_header,data_test_wo_header,sc).persist()
val ratings_test=create_test(data_wo_header,data_test_wo_header,sc)
// val ratings_test=create_test(data_wo_header,data_test_wo_header,sc)
// data_test_wo_header.unpersist()
// data_test.unpersist()
//// data.unpersist()
//// data_test.unpersist()
// Create the model using rating_train the training data
val rank = 1
val numIterations = 10
val model=create_model(ratings_train,rank,numIterations)
// ratings_train.unpersist()
// model.save(sc, "target/tmp/myCollaborativeFilter")
// val Model = MatrixFactorizationModel.load(sc, "/Users/echoesofconc/myCollaborativeFilter")
// Average user,Rating from training
val user_avgrat=ratings_test.map(_.split(",") match { case Array(user, mov, rate, temp) =>(user.toInt, (rate.toDouble,1.0))}).reduceByKey((x,y)=>(x._1 + y._1, x._2 + y._2)).mapValues{ case (sum, count) => (1.0 * sum) / count }
//data
// Predict user_mov ratings
val user_mov = data_test_wo_header.map(_.split(',') match { case Array(user, mov) =>
(user.toInt,mov.toInt)
})
val predictions =
model.predict(user_mov).map { case Rating(user, mov, rate) =>
((user, mov), rate)
}
// Combine predictions with the unpredicted (user, movie) pairs for which the model produced nothing. Going forward we need to improve the accuracy of these predictions
val user_mov_rat=user_mov.map(x=>(x,0.0))
val predictions_unpredicted_combined= predictions.union(user_mov_rat).reduceByKey(_+_).map(x=>(x._1._1,(x._1._2,x._2)))
// Combine average rating and predictions+unpredicted values
val avg_rating_predictions_unpredicted_combined=predictions_unpredicted_combined.join(user_avgrat)
// Generate final predictions RDD
val final_predictions=avg_rating_predictions_unpredicted_combined.map{x=>
var temp=((x._1,x._2._1._1),x._2._2)
if(x._2._1._2==0.0){temp=((x._1,x._2._1._1),x._2._2)}
if(x._2._1._2!=0.0){temp=((x._1,x._2._1._1),x._2._1._2)}
temp
}
// Adjust for ratings above 5.0 and below 0.0
val final_predictions_adjusted=final_predictions.map{x=>
var temp=x
if (x._2>5.0){temp=(x._1,5.0)}
if (x._2<0.0){temp=(x._1,0.0)}
temp
}
val ratesAndPreds = ratings_test.map(_.split(",") match { case Array(user, mov, rate, temp) => ((user.toInt,mov.toInt),rate.toDouble)}).join(final_predictions_adjusted)
val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
val err = (r1 - r2)
err * err
}.mean()
val RMSE=math.sqrt(MSE)
// Print output.txt
print_outputfile(final_predictions_adjusted)
// Print the prediction results
print_results(final_predictions_adjusted.sortByKey())
print(RMSE+"\n")
}
}
This worked out to be fine. It uses subtractByKey (effectively a key-based anti-join) to create the testing and training data:
/**
* Created by echoesofconc on 3/8/17.
*/
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import java.io._
object Prateek_Agrawal_task1 {
def dropheader(data: RDD[String]): RDD[String] = {
data.mapPartitionsWithIndex((idx, lines) => {
if (idx == 0) {
lines.drop(1)
}
lines
})
}
def create_training(ratings_split: RDD[Array[String]], ratings_testing: Array[Array[String]]) = {
ratings_split.filter(x => {
ratings_testing.exists(y =>
(x(0) == y(0) && x(1) == y(1))
) == false
})
}
def create_testing(ratings_split: RDD[Array[String]], ratings_testing: Array[Array[String]]) = {
ratings_split.filter(x => {
ratings_testing.exists(y =>
(x(0) == y(0) && x(1) == y(1))
) == true
})
}
def create_model(ratings_train:RDD[((String, String), (String, String))],rank:Int,numIterations:Int ):org.apache.spark.mllib.recommendation.MatrixFactorizationModel={
val ratings = ratings_train.map(_ match { case ((user,item),(rate,temp)) =>
Rating(user.toInt, item.toInt, rate.toDouble)
})
val model = ALS.train(ratings, rank, numIterations, 0.01)
return model
}
def print_results(final_predictions_adjusted:RDD[((Int, Int), Double)])={
val rating_range=final_predictions_adjusted.map(x=>(x._2.toInt,1)).reduceByKey(_+_).sortByKey()
val rating_range_till_4=rating_range.map{x=>
var temp=x
if (x._1==5){temp=(4,x._2)}
temp
}.reduceByKey(_+_)
rating_range_till_4.sortByKey().foreach { x =>
if(x._1==0)
printf(">=0 and <1: " + x._2+"\n")
if(x._1==1)
printf(">=1 and <2: " + x._2+"\n")
if(x._1==2)
printf(">=2 and <3: " + x._2+"\n")
if(x._1==3)
printf(">=3 and <4: " + x._2+"\n")
if(x._1==4)
printf(">=4 " + x._2+"\n")
if(x._1==5)
printf("=5 " + x._2+"\n")
}
}
case class User_mov_rat(UserID: Int, MovieID:Int, Pred_rating: Double)
def print_outputfile(final_predictions_adjusted:RDD[((Int, Int), Double)])={
val writer = new FileWriter(new File("./output.txt" ))
writer.write("UserID,MovieID,Pred_rating\n")
final_predictions_adjusted.collect().foreach(x=>{writer.write(x._1._1+","+x._1._2+","+x._2+"\n")})
writer.close()
}
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Prateek_Agrawal_task1").setMaster("local[2]")
val sc = new SparkContext(conf)
// val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-latest-small/ratings.csv"
// val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_small.csv"
val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-20m/ratings.csv"
val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_20m.csv"
val data = sc.textFile(file, 2).cache()
val data_test = sc.textFile(test, 2).cache()
// Drop Header
// val data_wo_header=dropheader(data).persist()
// val data_test_wo_header=dropheader(data_test).persist()
// Create training and testing data of the format (User ID, MovID, Rating, Time)
val data_wo_header=dropheader(data).map(_.split(",")).map(x=>((x(0),x(1)),(x(2),x(3))))
val data_test_wo_header=dropheader(data_test).map(_.split(",")).map(x=>((x(0),x(1)),1))
val ratings_train=data_wo_header.subtractByKey(data_test_wo_header)
val ratings_test=data_wo_header.subtractByKey(ratings_train)
data_test_wo_header.unpersist()
data_wo_header.unpersist()
data.unpersist()
data_test.unpersist()
// val ratings_split = data_wo_header.map(line => line.split(",")).persist()
// data_wo_header.unpersist()
// data.unpersist()
// val ratings_testing = data_test_wo_header.map(line => line.split(",")).collect()
// data_test_wo_header.unpersist()
// data_test.unpersist()
//
// val ratings_train = create_training(ratings_split, ratings_testing).persist()
// val ratings_test=create_testing(ratings_split, ratings_testing)
// ratings_split.unpersist()
// ratings_test.unpersist()
// Create the model using rating_train the training data
val rank = 1
val numIterations = 10
// val model=create_model(ratings_train,rank,numIterations)
//
// model.save(sc, "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/myCollaborativeFilter")
val model = MatrixFactorizationModel.load(sc, "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/myCollaborativeFilter")
// Average user,Rating from training
val user_avgrat=ratings_train.map(_ match { case ((user, mov), (rate, temp)) =>(user.toInt, (rate.toDouble,1.0))}).reduceByKey((x,y)=>(x._1 + y._1, x._2 + y._2)).mapValues{ case (sum, count) => (1.0 * sum) / count }
ratings_train.unpersist()
// Predict user_mov ratings
val user_mov = data_test_wo_header.map(_ match { case ((user, mov),temp) =>
(user.toInt,mov.toInt)
})
val predictions =
model.predict(user_mov).map { case Rating(user, mov, rate) =>
((user, mov), rate)
}
// Combine predictions with the unpredicted (user, movie) pairs for which the model produced nothing. Going forward we need to improve the accuracy of these predictions
val user_mov_rat=user_mov.map(x=>(x,0.0))
val predictions_unpredicted_combined= predictions.union(user_mov_rat).reduceByKey(_+_).map(x=>(x._1._1,(x._1._2,x._2)))
// Combine average rating and predictions+unpredicted values
val avg_rating_predictions_unpredicted_combined=predictions_unpredicted_combined.join(user_avgrat)
// Generate final predictions RDD
val final_predictions=avg_rating_predictions_unpredicted_combined.map{x=>
var temp=((x._1,x._2._1._1),x._2._2)
if(x._2._1._2==0.0){temp=((x._1,x._2._1._1),x._2._2)}
if(x._2._1._2!=0.0){temp=((x._1,x._2._1._1),x._2._1._2)}
temp
}
// Adjust for ratings above 5.0 and below 0.0
val final_predictions_adjusted=final_predictions.map{x=>
var temp=x
if (x._2>5.0){temp=(x._1,5.0)}
if (x._2<0.0){temp=(x._1,0.0)}
temp
}
// final_predictions_adjusted.count()
val ratesAndPreds_map = ratings_test.map(_ match { case ((user, mov), (rate, temp)) => ((user.toInt,mov.toInt),rate.toDouble)})
val ratesAndPreds=ratesAndPreds_map.join(final_predictions_adjusted)
val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
val err = (r1 - r2)
err * err
}.mean()
val RMSE=math.sqrt(MSE)
// Print output.txt
print_outputfile(final_predictions_adjusted)
// Print the prediction results
print_results(final_predictions_adjusted.sortByKey())
print(RMSE+"\n")
}
}
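Whichever version you keep, the original bottleneck in the first two attempts is the per-record scan of the collected test set (exists / indexOf over the whole array for every one of the ~20M training lines). If you want to stay with a broadcast instead of subtractByKey, a hedged sketch using a broadcast HashSet keyed by (user, movie) gives O(1) membership per row; it assumes the header-less RDD[String]s data_wo_header and data_test_wo_header from the second version:
// Broadcast only the (user, movie) keys of the test set as a Set for O(1) lookups
val testKeys = sc.broadcast(
  data_test_wo_header.map(_.split(',')).map(f => (f(0), f(1))).collect().toSet)

val ratings_train = data_wo_header.filter { line =>
  val f = line.split(',')
  !testKeys.value.contains((f(0), f(1)))   // keep rows NOT present in the test set
}
val ratings_test = data_wo_header.filter { line =>
  val f = line.split(',')
  testKeys.value.contains((f(0), f(1)))    // keep rows present in the test set
}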

Avoid duplicated computation in guards

Here is the code:
import java.util.{Calendar, Date, GregorianCalendar}
import com.mongodb.casbah.Imports._
import com.mongodb.casbah.commons.conversions.scala._
case class Quota(date: Date, used: Int)
object MongoDateDemo extends App {
val client = InsertUsers.getClient
val db = client("github")
val quota = db("quota")
val rand = scala.util.Random
// quota.drop()
// (1 to 100).foreach { _ =>
// quota += DBObject("date" -> new Date(), "used" -> rand.nextInt(10))
// Thread.sleep(1000)
// }
val minuteInMilliseconds = 60 * 1000
def thresholdDate(minute: Int) = new Date(new Date().getTime - minuteInMilliseconds * minute) // `minute` minutes ago
val fields = DBObject("_id" -> 0, "used" -> 1)
val x = quota.find("date" $gte thresholdDate(28), fields).collect {
case x if x.getAs[Int]("used").isDefined => x.getAs[Int]("used").get
}
println(x.toList.sum)
// val y = x.map {
// case dbo: DBObject => Quota(dbo.getAs[Date]("date").getOrElse(new Date(0)), dbo.getAs[Int]("used").getOrElse(0))
// }
}
It reads documents from a collection, filters out those that don't have "used" defined, and then sums up the numbers.
The x.getAs[Int]("used") part is computed twice; how can I avoid that?
Not much of a Scala programmer, but isn't that what flatMap is for?
quota
.find("date" $gte thresholdDate(38), fields)
.flatMap(_.getAs[Int]("used").toList)
Since this is not possible to avoid directly, I had to do it in two steps: map into Options, then collect. I used the view method so that the collection is not traversed twice:
import java.util.{Calendar, Date, GregorianCalendar}
import com.mongodb.casbah.Imports._
import com.mongodb.casbah.commons.conversions.scala._
case class Quota(date: Date, used: Int)
object MongoDateDemo extends App {
val client = InsertUsers.getClient
val db = client("github")
val quota = db("quota")
val rand = scala.util.Random
// quota.drop()
// (1 to 100).foreach { _ =>
// quota += DBObject("date" -> new Date(), "used" -> rand.nextInt(10))
// Thread.sleep(1000)
// }
val minuteInMilliseconds = 60 * 1000
def thresholdDate(minute: Int) = new Date(new Date().getTime - minuteInMilliseconds * minute) // `minute` minutes ago
val fields = DBObject("_id" -> 0, "used" -> 1)
val usedNumbers = quota.find("date" $gte thresholdDate(38), fields).toList.view.map {
_.getAs[Int]("used")
}.collect {
case Some(i) => i
}.force
println(usedNumbers.sum)
// val y = x.map {
// case dbo: DBObject => Quota(dbo.getAs[Date]("date").getOrElse(new Date(0)), dbo.getAs[Int]("used").getOrElse(0))
// }
}
Assuming getAs returns an Option, this should do what you want:
val x = quota.find("date" $gte thresholdDate(28), fields).flatMap { _.getAs[Int]("used") }
This is similar to doing this:
scala> List(Some(1), Some(2), None, Some(4)).flatMap(x => x)
res: List[Int] = List(1, 2, 4)
Or this:
scala> (1 to 20).flatMap(x => if(x%2 == 0) Some(x) else None)
res: scala.collection.immutable.IndexedSeq[Int] = Vector(2, 4, 6, 8, 10, 12, 14, 16, 18, 20)
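Another general trick for the "duplicated computation in a guard" problem, beyond flatMap: bind the computed value through a small custom extractor, so the pattern itself carries the result and no guard is needed. A sketch against the Casbah cursor above (Used is a hypothetical helper, not part of Casbah):
// Matches only documents where "used" is defined, and binds the Int in one go
object Used {
  def unapply(dbo: DBObject): Option[Int] = dbo.getAs[Int]("used")
}

val x = quota.find("date" $gte thresholdDate(28), fields).collect {
  case Used(u) => u
}
println(x.toList.sum)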

SparkContext cannot be launched in the same program as a Streaming SparkContext

I created the following test, which fits a simple linear regression model to dummy streaming data.
I use hyper-parameter optimisation to find good values of stepSize, numIterations and initialWeights for the linear model.
Everything runs fine, except for the last lines of the code, which are commented out:
// Save the evaluations for further visualization
// val gridEvalsRDD = sc.parallelize(gridEvals)
// gridEvalsRDD.coalesce(1)
// .map(e => "%.3f\t%.3f\t%d\t%.3f".format(e._1, e._2, e._3, e._4))
// .saveAsTextFile("data/mllib/streaming")
The problem is with the SparkContext sc. If I initialize it at the beginning of the test, the program throws errors. It looks like sc should be defined in some special way in order to avoid conflicts with ssc (the streaming Spark context). Any ideas?
The whole code:
// scalastyle:off
package org.apache.spark.mllib.regression
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
import org.scalatest.BeforeAndAfter
class StreamingLinearRegressionHypeOpt extends TestSuiteBase with BeforeAndAfter {
// use longer wait time to ensure job completion
override def maxWaitTimeMillis: Int = 20000
var ssc: StreamingContext = _
override def afterFunction() {
super.afterFunction()
if (ssc != null) {
ssc.stop()
}
}
def calculateMSE(output: Seq[Seq[(Double, Double)]], n: Int): Double = {
val mse = output
.map {
case seqOfPairs: Seq[(Double, Double)] =>
val err = seqOfPairs.map(p => math.abs(p._1 - p._2)).sum
err*err
}.sum / n
mse
}
def calculateRMSE(output: Seq[Seq[(Double, Double)]], n: Int): Double = {
val mse = output
.map {
case seqOfPairs: Seq[(Double, Double)] =>
val err = seqOfPairs.map(p => math.abs(p._1 - p._2)).sum
err*err
}.sum / n
math.sqrt(mse)
}
def dummyStringStreamSplit(datastream: Stream[String]) =
datastream.flatMap(txt => txt.split(" "))
test("Test 1") {
// create model initialized with zero weights
val model = new StreamingLinearRegressionWithSGD()
.setInitialWeights(Vectors.dense(0.0, 0.0))
.setStepSize(0.2)
.setNumIterations(25)
// generate sequence of simulated data for testing
val numBatches = 10
val nPoints = 100
val inputData = (0 until numBatches).map { i =>
LinearDataGenerator.generateLinearInput(0.0, Array(10.0, 10.0), nPoints, 42 * (i + 1))
}
// Without hyper-parameters optimization
withStreamingContext(setupStreams(inputData, (inputDStream: DStream[LabeledPoint]) => {
model.trainOn(inputDStream)
model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
})) { ssc =>
val output: Seq[Seq[(Double, Double)]] = runStreams(ssc, numBatches, numBatches)
val rmse = calculateRMSE(output, nPoints)
println(s"RMSE = $rmse")
}
// With hyper-parameters optimization
val gridParams = Map(
"initialWeights" -> List(Vectors.dense(0.0, 0.0), Vectors.dense(10.0, 10.0)),
"stepSize" -> List(0.1, 0.2, 0.3),
"numIterations" -> List(25, 50)
)
val gridEvals = for (initialWeights <- gridParams("initialWeights");
stepSize <- gridParams("stepSize");
numIterations <- gridParams("numIterations")) yield {
val lr = new StreamingLinearRegressionWithSGD()
.setInitialWeights(initialWeights.asInstanceOf[Vector])
.setStepSize(stepSize.asInstanceOf[Double])
.setNumIterations(numIterations.asInstanceOf[Int])
withStreamingContext(setupStreams(inputData, (inputDStream: DStream[LabeledPoint]) => {
lr.trainOn(inputDStream)
lr.predictOnValues(inputDStream.map(x => (x.label, x.features)))
})) { ssc =>
val output: Seq[Seq[(Double, Double)]] = runStreams(ssc, numBatches, numBatches)
val cvRMSE = calculateRMSE(output, nPoints)
println(s"RMSE = $cvRMSE")
(initialWeights, stepSize, numIterations, cvRMSE)
}
}
// Save the evaluations for further visualization
// val gridEvalsRDD = sc.parallelize(gridEvals)
// gridEvalsRDD.coalesce(1)
// .map(e => "%.3f\t%.3f\t%d\t%.3f".format(e._1, e._2, e._3, e._4))
// .saveAsTextFile("data/mllib/streaming")
}
}
// scalastyle:on
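A sketch of one way around the conflict (not verified against TestSuiteBase internals): only one SparkContext can exist per JVM, so instead of initializing your own at the start of the test, reuse the one that is already there, either via ssc.sparkContext while a StreamingContext is live, or via SparkContext.getOrCreate. For example, right after the gridEvals loop:
// Hypothetical placement after the gridEvals loop; getOrCreate returns the context
// that is already active instead of constructing a second, conflicting one.
val sc = org.apache.spark.SparkContext.getOrCreate()
val gridEvalsRDD = sc.parallelize(gridEvals)
gridEvalsRDD.coalesce(1)
  .map { case (w, step, iters, rmse) => s"$w\t$step\t$iters\t$rmse" }
  .saveAsTextFile("data/mllib/streaming")
Since gridEvals is already a small in-memory collection, an even simpler option is to skip the RDD entirely and write it out with a plain java.io.PrintWriter.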