val patterns = ctx.getBroadcastState(patternStateDescriptor)
The imports I made
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{MapStateDescriptor, ValueState, ValueStateDescriptor}
import org.apache.flink.api.scala.typeutils.Types
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.datastream.BroadcastStream
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
Here's the code
// Job wiring: reads two Kafka topics ("patterns" and "logs"), broadcasts the parsed patterns
// to every parallel instance and evaluates the keyed log stream against them.
val env = StreamExecutionEnvironment.getExecutionEnvironment
val properties = new Properties()
properties.setProperty("bootstrap.servers","localhost:9092")
// Pattern topic: each record is split on "," into three trimmed fields.
val patternStream = new FlinkKafkaConsumer010("patterns", new SimpleStringSchema, properties)
val patterns = env.addSource(patternStream)
var patternData = patterns.map {
str =>
val splitted_str = str.split(",")
PatternStream(splitted_str(0).trim, splitted_str(1).trim, splitted_str(2).trim)
}
// Log topic: same three-field comma-separated layout.
// NOTE(review): only whitespace is trimmed; if records carry literal double quotes
// they stay part of the field values -- confirm against the producers.
val logsStream = new FlinkKafkaConsumer010("logs", new SimpleStringSchema, properties)
// logsStream.setStartFromEarliest()
val logs = env.addSource(logsStream)
var data = logs.map {
str =>
val splitted_str = str.split(",")
LogsTest(splitted_str.head.trim, splitted_str(1).trim, splitted_str(2).trim)
}
// Key the log stream by metric.
val keyedData: KeyedStream[LogsTest, String] = data.keyBy(_.metric)
// NOTE(review): this descriptor is typed [Unit, PatternStream], while PatternEvaluator declares
// a descriptor with the same name "patterns" typed [String, String] -- the two should agree; confirm.
val bcStateDescriptor = new MapStateDescriptor[Unit, PatternStream]("patterns", Types.UNIT, Types.of[PatternStream]) // first type defined is for the key and second data type defined is for the value
val broadcastPatterns: BroadcastStream[PatternStream] = patternData.broadcast(bcStateDescriptor)
// Connect keyed logs with the broadcast patterns and run the evaluator.
val alerts = keyedData
.connect(broadcastPatterns)
.process(new PatternEvaluator())
alerts.print()
// println(alerts.getClass)
// val sinkProducer = new FlinkKafkaProducer010("output", new SimpleStringSchema(), properties)
env.execute("Flink Broadcast State Job")
}
/**
 * Matches keyed log readings against patterns received on the broadcast side.
 *
 * The broadcast state maps a metric name to the value a reading of that metric must carry.
 * A reading is emitted as `(timestamp, value, metric)` when the value stored for its metric
 * equals the reading's own value. Readings whose metric has no stored pattern (for example
 * because they arrive before the pattern was broadcast) are dropped without output.
 */
class PatternEvaluator()
  extends KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)] {

  // Broadcast state: metric name -> expected value.
  private lazy val patternStateDescriptor =
    new MapStateDescriptor("patterns", classOf[String], classOf[String])

  // Keyed state handle; it is registered in open() but not read or written by this class.
  private var lastMetricState: ValueState[String] = _

  override def open(parameters: Configuration): Unit = {
    val lastMetricDescriptor = new ValueStateDescriptor("last-metric", classOf[String])
    lastMetricState = getRuntimeContext.getState(lastMetricDescriptor)
  }

  /**
   * Emits the reading when the pattern stored under its metric equals its value.
   *
   * The broadcast state is keyed by metric, so the stored entry is the expected *value*;
   * it has to be compared with `reading.value`, not with the metric name.
   */
  override def processElement(reading: LogsTest,
                              readOnlyCtx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#ReadOnlyContext,
                              out: Collector[(String, String, String)]): Unit = {
    val patterns = readOnlyCtx.getBroadcastState(patternStateDescriptor)
    if (patterns.contains(reading.metric)) {
      val expectedValue: String = patterns.get(reading.metric)
      if (expectedValue == reading.value) {
        out.collect((reading.timestamp, reading.value, reading.metric))
      }
    }
  }

  /**
   * Stores the value of an incoming pattern under its metric name.
   * Only patterns for the "IP" metric are kept; every other metric is ignored.
   */
  override def processBroadcastElement(
      update: PatternStream,
      ctx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#Context,
      out: Collector[(String, String, String)]
  ): Unit = {
    val patterns = ctx.getBroadcastState(patternStateDescriptor)
    if (update.metric == "IP") {
      // update.operator is currently not stored; matching is always by equality.
      patterns.put(update.metric, update.value)
    }
  }
}
Sample Data :- Logs Stream
"21/09/98","IP", "5.5.5.5"
Pattern Stream
"IP","==","5.5.5.5"
I'm unable to get the desired result from the analysis, i.e. 21/09/98,IP,5.5.5.5
There's no error as of now; the data just isn't being analysed.
The code is reading both streams (checked).
One common source of trouble in cases like this is that the API offers no control over the order in which the patterns and the data are ingested. It could be that processElement is being called before processBroadcastElement.
I'm new to Akka Streams. I used the following code for CSV parsing.
/**
 * Imports readings from the CSV files found in the configured import directory.
 *
 * Each file is read line by line, every line is parsed into a [[Reading]], readings are
 * averaged in groups of two and the resulting [[ValidReading]]s are handed to `save`.
 * All tuning knobs are read from the `importer.*` section of the supplied configuration.
 */
class CsvParser(config: Config)(implicit system: ActorSystem) extends LazyLogging with NumberValidation {

  import system.dispatcher
  import scala.util.control.NonFatal

  private val importDirectory = Paths.get(config.getString("importer.import-directory")).toFile
  private val linesToSkip = config.getInt("importer.lines-to-skip")
  private val concurrentFiles = config.getInt("importer.concurrent-files")
  private val concurrentWrites = config.getInt("importer.concurrent-writes")
  private val nonIOParallelism = config.getInt("importer.non-io-parallelism")

  /** Persistence hook; currently a no-op that completes immediately. */
  def save(r: ValidReading): Future[Unit] =
    Future.successful(())

  /**
   * Parses one `id;value` line.
   *
   * A value that cannot be parsed yields an `InvalidReading` for the id. An id that cannot be
   * parsed (or a missing id field) fails the returned future, as no reading can be built.
   */
  def parseLine(filePath: String)(line: String): Future[Reading] = Future {
    val fields = line.split(";")
    val id = fields(0).toInt
    try {
      ValidReading(id, fields(1).toDouble)
    } catch {
      // NonFatal lets OutOfMemoryError, InterruptedException and friends propagate.
      case NonFatal(t) =>
        logger.error(s"Unable to parse line in $filePath:\n$line: ${t.getMessage}")
        InvalidReading(id)
    }
  }

  /** Splits a byte stream into newline-delimited frames of at most 128 bytes. */
  val lineDelimiter: Flow[ByteString, ByteString, NotUsed] =
    Framing.delimiter(ByteString("\n"), 128, allowTruncation = true)

  /** Turns every file into the stream of readings parsed from its lines. */
  val parseFile: Flow[File, Reading, NotUsed] =
    Flow[File].flatMapConcat { file =>
      // assumes FileSource is an alias of scala.io.Source (fromFile/getLines/close) -- TODO confirm
      val handle = FileSource.fromFile(file)
      // val gzipInputStream = new GZIPInputStream(new FileInputStream(file))
      Source
        .fromIterator(() => handle.getLines())
        .mapAsync(parallelism = nonIOParallelism)(parseLine(file.getPath))
        .watchTermination() { (mat, done) =>
          // Release the file handle once the per-file sub-stream completes, fails or is cancelled.
          done.onComplete(_ => handle.close())
          mat
        }
    }

  /**
   * Averages the valid readings of each group of two consecutive readings.
   * The result carries the id of the group's first reading and -1 when no reading was valid.
   */
  val computeAverage: Flow[Reading, ValidReading, NotUsed] =
    Flow[Reading].grouped(2).mapAsyncUnordered(parallelism = nonIOParallelism) { readings =>
      Future {
        val validReadings = readings.collect { case r: ValidReading => r }
        val average =
          if (validReadings.nonEmpty) validReadings.map(_.value).sum / validReadings.size else -1
        ValidReading(readings.head.id, average)
      }
    }

  /** Saves readings with up to `concurrentWrites` writes in flight; completes when all are stored. */
  val storeReadings: Sink[ValidReading, Future[Done]] =
    Flow[ValidReading]
      .mapAsyncUnordered(concurrentWrites)(save)
      .toMat(Sink.ignore)(Keep.right)

  /** Parsing followed by averaging for a single file. */
  val processSingleFile: Flow[File, ValidReading, NotUsed] =
    Flow[File]
      .via(parseFile)
      .via(computeAverage)

  /**
   * Runs the import over every file of the import directory, processing up to
   * `concurrentFiles` files in parallel, and logs the elapsed time or the failure.
   *
   * @return a future completed when all readings have been stored
   */
  def importFromFiles: Future[Done] = {
    implicit val materializer = ActorMaterializer()

    // File.listFiles returns null when the path is not a readable directory.
    val files = Option(importDirectory.listFiles).map(_.toList).getOrElse(Nil)
    logger.info(s"Starting import of ${files.size} files from ${importDirectory.getPath}")

    val startTime = System.currentTimeMillis()

    val balancer = GraphDSL.create() { implicit builder =>
      import GraphDSL.Implicits._

      val balance = builder.add(Balance[File](concurrentFiles))
      val merge = builder.add(Merge[ValidReading](concurrentFiles))

      (1 to concurrentFiles).foreach { _ =>
        balance ~> processSingleFile ~> merge
      }

      FlowShape(balance.in, merge.out)
    }

    Source(files)
      .via(balancer)
      .withAttributes(ActorAttributes.supervisionStrategy { e =>
        logger.error("Exception thrown during stream processing", e)
        Supervision.Resume
      })
      .runWith(storeReadings)
      .andThen {
        case Success(_) =>
          val elapsedTime = (System.currentTimeMillis() - startTime) / 1000.0
          logger.info(s"Import finished in ${elapsedTime}s")
        case Failure(e) => logger.error("Import failed", e)
      }
  }
}
I want to use Akka HTTP to return all the ValidReading entities parsed from an uploaded CSV, but I can't work out how to do that.
The above code reads files from a directory on the server and parses each line to generate a ValidReading.
How can I upload a CSV via akka-http, parse the file, and stream the resulting response back to the caller?
The "essence" of the solution is something like this:
import akka.http.scaladsl.server.Directives._
// Echo-style upload route: takes the multipart part named "csv" and streams its bytes
// straight back as a text/csv response. Replace the identity mapping with the real
// per-chunk transformation.
val route =
  fileUpload("csv") { case (_, uploadedBytes) =>
    val transformed = uploadedBytes.map(chunk => chunk)
    val responseEntity = HttpEntity(ContentTypes.`text/csv(UTF-8)`, transformed)
    complete(HttpResponse(entity = responseEntity))
  }
You detect that the upload is multipart/form-data with a part named "csv" and get the byteSource from it. Do the calculation (insert your logic in place of the .map(x => x) part), convert your data back to ByteString, and complete the request with the new source. This makes your endpoint behave like a proxy.
I'm trying to build a simple ALS model following the Spark documentation here.
My first file (ratings.csv) has 20 million UserID,MovID,Rat rows and can be downloaded here.
I also have testing data, which is a subset of ratings.csv. That test dataset can be downloaded here:
The test file has just the UserID and Movie ID columns.
So to create the training data we have to filter ratings.csv.
The following code works fine for a smaller case of 100,000 UserID,MovID ratings, but I am not able to generate the model for the big case.
Please help with a pointer.
/**
* Created by echoesofconc on 3/8/17.
*/
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import java.io._
import scala.collection.mutable.ListBuffer
/**
 * Trains an ALS model on ratings.csv minus the (user, movie) pairs listed in the test file,
 * predicts the held-out pairs, writes them to ./output.txt and prints a rating histogram
 * together with the RMSE.
 */
object Prateek_Agrawal_task1 {

  /** Removes the first line of the first partition, i.e. the CSV header. */
  def dropheader(data: RDD[String]): RDD[String] =
    data.mapPartitionsWithIndex { (idx, lines) =>
      // Iterator.drop returns the shortened iterator; its result has to be used.
      if (idx == 0) lines.drop(1) else lines
    }

  /** (user, movie) pairs of the test rows, for constant-time membership checks. */
  private def heldOutKeys(ratings_testing: Array[Array[String]]): Set[(String, String)] =
    ratings_testing.map(row => (row(0), row(1))).toSet

  /** Rating rows whose (user, movie) pair does NOT occur in `ratings_testing`. */
  def create_training(ratings_split: RDD[Array[String]], ratings_testing: Array[Array[String]]) = {
    val heldOut = heldOutKeys(ratings_testing)
    ratings_split.filter(row => !heldOut.contains((row(0), row(1))))
  }

  /** Rating rows whose (user, movie) pair occurs in `ratings_testing`. */
  def create_testing(ratings_split: RDD[Array[String]], ratings_testing: Array[Array[String]]) = {
    val heldOut = heldOutKeys(ratings_testing)
    ratings_split.filter(row => heldOut.contains((row(0), row(1))))
  }

  /**
   * Trains an explicit-feedback ALS model with lambda 0.01.
   *
   * @param ratings_train rows of exactly four fields: user, item, rating, timestamp
   */
  def create_model(ratings_train: RDD[Array[String]], rank: Int, numIterations: Int): org.apache.spark.mllib.recommendation.MatrixFactorizationModel = {
    val ratings = ratings_train.map { case Array(user, item, rate, _) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    }
    ALS.train(ratings, rank, numIterations, 0.01)
  }

  /** Prints how many predictions fall into each integer rating bucket; 5 is counted in ">=4". */
  def print_results(final_predictions_adjusted: RDD[((Int, Int), Double)]) = {
    val labels = Map(
      0 -> ">=0 and <1: ",
      1 -> ">=1 and <2: ",
      2 -> ">=2 and <3: ",
      3 -> ">=3 and <4: ",
      4 -> ">=4 "
    )
    final_predictions_adjusted
      .map { case (_, rating) =>
        val bucket = rating.toInt
        (if (bucket == 5) 4 else bucket, 1)
      }
      .reduceByKey(_ + _)
      .sortByKey()
      // Bring the few buckets to the driver so they print in order on the driver's stdout.
      .collect()
      .foreach { case (bucket, count) =>
        labels.get(bucket).foreach(label => print(label + count + "\n"))
      }
  }

  case class User_mov_rat(UserID: Int, MovieID: Int, Pred_rating: Double)

  /** Writes all predictions as CSV to ./output.txt; the whole RDD is collected to the driver. */
  def print_outputfile(final_predictions_adjusted: RDD[((Int, Int), Double)]) = {
    val writer = new FileWriter(new File("./output.txt"))
    try {
      writer.write("UserID,MovieID,Pred_rating\n")
      final_predictions_adjusted.collect().foreach { case ((user, movie), rating) =>
        writer.write(user + "," + movie + "," + rating + "\n")
      }
    } finally {
      writer.close()
    }
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Prateek_Agrawal_task1").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-20m/ratings.csv"
    val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_20m.csv"
    val data = sc.textFile(file, 2).cache()
    val data_test = sc.textFile(test, 2).cache()

    // Drop Header
    val data_wo_header = dropheader(data).persist()
    val data_test_wo_header = dropheader(data_test).persist()

    // Create Training and testing data of the format (User ID, MovID, Rating, Time)
    val ratings_split = data_wo_header.map(_.split(",")).persist()
    data_wo_header.unpersist()
    data.unpersist()
    val ratings_testing = data_test_wo_header.map(_.split(",")).collect()
    data_test_wo_header.unpersist()
    data_test.unpersist()

    val ratings_train = create_training(ratings_split, ratings_testing).persist()
    val ratings_test = create_testing(ratings_split, ratings_testing)
    ratings_split.unpersist()

    // Create the model using ratings_train, the training data
    val rank = 1
    val numIterations = 10
    val model = create_model(ratings_train, rank, numIterations)
    ratings_train.unpersist()

    // Per-user average rating, used for pairs the model cannot predict.
    // NOTE(review): this is computed from ratings_test, not ratings_train -- confirm this is intended.
    val user_avgrat = ratings_test
      .map { case Array(user, _, rate, _) => (user.toInt, (rate.toDouble, 1.0)) }
      .reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
      .mapValues { case (sum, count) => sum / count }

    // Predict user_mov ratings
    val user_mov = data_test_wo_header.map(_.split(',') match { case Array(user, mov) =>
      (user.toInt, mov.toInt)
    })
    val predictions = model.predict(user_mov).map { case Rating(user, mov, rate) =>
      ((user, mov), rate)
    }

    // Pairs the model returned nothing for keep 0.0 after the union + sum.
    val user_mov_rat = user_mov.map(pair => (pair, 0.0))
    val predictions_unpredicted_combined = predictions
      .union(user_mov_rat)
      .reduceByKey(_ + _)
      .map { case ((user, mov), rate) => (user, (mov, rate)) }

    // Combine average rating and predictions+unpredicted values
    val avg_rating_predictions_unpredicted_combined = predictions_unpredicted_combined.join(user_avgrat)

    // Final prediction: the model's rating, or the user's average where the model gave 0.0.
    val final_predictions = avg_rating_predictions_unpredicted_combined.map {
      case (user, ((mov, predicted), userAvg)) =>
        ((user, mov), if (predicted == 0.0) userAvg else predicted)
    }

    // Adjust for ratings above 5.0 and below 0.0
    val final_predictions_adjusted = final_predictions.map { case (key, rating) =>
      (key, if (rating > 5.0) 5.0 else if (rating < 0.0) 0.0 else rating)
    }

    val ratesAndPreds = ratings_test
      .map { case Array(user, mov, rate, _) => ((user.toInt, mov.toInt), rate.toDouble) }
      .join(final_predictions_adjusted)
    val MSE = ratesAndPreds.map { case (_, (actual, predicted)) =>
      val err = actual - predicted
      err * err
    }.mean()
    val RMSE = math.sqrt(MSE)

    // Print output.txt
    print_outputfile(final_predictions_adjusted)
    // Print the prediction results
    print_results(final_predictions_adjusted.sortByKey())
    print(RMSE + "\n")
  }
}
In case someone thinks I should be doing a regex match: I have tried that approach, but that doesn't seem to be the bottleneck.
I only need to complete the create-model part, on which I am stuck for the big dataset. Can somebody help?
EDIT:
Another approach I tried, which is much faster, uses broadcast variables. But it has been running for 12 hrs with no signs of progress. In the Spark UI, the whole RDD (ratings.csv, ~500MB) is somehow not cached; only around 64MB with 2.5 million lines is being processed initially. I am using --executor-memory -8g. I have modified the create_training and create_testing functions:
/**
* Created by echoesofconc on 3/8/17.
*/
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
import java.io._
/**
 * Broadcast-variable variant: splits ratings.csv into training and test rows using the
 * (user, movie) pairs of the test file, trains ALS on the training rows, predicts the
 * held-out pairs, writes them to ./output.txt and prints a rating histogram and the RMSE.
 */
object Prateek_Agrawal_task2 {

  /** Removes the first line of the first partition, i.e. the CSV header. */
  def dropheader(data: RDD[String]): RDD[String] =
    data.mapPartitionsWithIndex { (idx, lines) =>
      // Iterator.drop returns the shortened iterator; its result has to be used.
      if (idx == 0) lines.drop(1) else lines
    }

  /** "user,movie" key of a CSV line: its first two comma-separated fields. */
  private def userMovieKey(line: String): String =
    line.split(",", 3).take(2).mkString(",")

  /** Broadcasts the set of "user,movie" keys found in the test lines. */
  private def broadcastTestKeys(data_test_wo_header: RDD[String], sc: SparkContext) =
    sc.broadcast(data_test_wo_header.map(userMovieKey).collect().toSet)

  /**
   * Rating lines whose (user, movie) pair is NOT in the test file.
   * Keys are compared exactly, so test key "1,2" does not match rating line "1,20,...".
   */
  def create_training(data_wo_header: RDD[String], data_test_wo_header: RDD[String], sc: SparkContext): RDD[String] = {
    val testKeys = broadcastTestKeys(data_test_wo_header, sc)
    data_wo_header.filter(line => !testKeys.value.contains(userMovieKey(line)))
  }

  /** Rating lines whose (user, movie) pair is in the test file (exact key match). */
  def create_test(data_wo_header: RDD[String], data_test_wo_header: RDD[String], sc: SparkContext): RDD[String] = {
    val testKeys = broadcastTestKeys(data_test_wo_header, sc)
    data_wo_header.filter(line => testKeys.value.contains(userMovieKey(line)))
  }

  /**
   * Trains an explicit-feedback ALS model with lambda 0.01.
   *
   * @param ratings_train CSV lines of exactly four fields: user, item, rating, timestamp
   */
  def create_model(ratings_train: RDD[String], rank: Int, numIterations: Int): org.apache.spark.mllib.recommendation.MatrixFactorizationModel = {
    val ratings = ratings_train.map(_.split(',') match { case Array(user, item, rate, _) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })
    ALS.train(ratings, rank, numIterations, 0.01)
  }

  /** Prints how many predictions fall into each integer rating bucket; 5 is counted in ">=4". */
  def print_results(final_predictions_adjusted: RDD[((Int, Int), Double)]) = {
    val labels = Map(
      0 -> ">=0 and <1: ",
      1 -> ">=1 and <2: ",
      2 -> ">=2 and <3: ",
      3 -> ">=3 and <4: ",
      4 -> ">=4 "
    )
    final_predictions_adjusted
      .map { case (_, rating) =>
        val bucket = rating.toInt
        (if (bucket == 5) 4 else bucket, 1)
      }
      .reduceByKey(_ + _)
      .sortByKey()
      // Bring the few buckets to the driver so they print in order on the driver's stdout.
      .collect()
      .foreach { case (bucket, count) =>
        labels.get(bucket).foreach(label => print(label + count + "\n"))
      }
  }

  case class User_mov_rat(UserID: Int, MovieID: Int, Pred_rating: Double)

  /** Writes all predictions as CSV to ./output.txt; the whole RDD is collected to the driver. */
  def print_outputfile(final_predictions_adjusted: RDD[((Int, Int), Double)]) = {
    val writer = new FileWriter(new File("./output.txt"))
    try {
      writer.write("UserID,MovieID,Pred_rating\n")
      final_predictions_adjusted.collect().foreach { case ((user, movie), rating) =>
        writer.write(user + "," + movie + "," + rating + "\n")
      }
    } finally {
      writer.close()
    }
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Prateek_Agrawal_task1").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-latest-small/ratings.csv"
    val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_small.csv"
    // val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-20m/ratings.csv"
    // val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_20m.csv"
    val data = sc.textFile(file, 2).persist()
    val data_test = sc.textFile(test, 2).persist()

    // Drop Header
    val data_wo_header = dropheader(data)
    val data_test_wo_header = dropheader(data_test)

    // Create training and testing data of the format (User ID, MovID, Rating, Time)
    val ratings_train = create_training(data_wo_header, data_test_wo_header, sc).persist()
    val ratings_test = create_test(data_wo_header, data_test_wo_header, sc)

    // Create the model using ratings_train, the training data
    val rank = 1
    val numIterations = 10
    val model = create_model(ratings_train, rank, numIterations)
    // model.save(sc, "target/tmp/myCollaborativeFilter")
    // val Model = MatrixFactorizationModel.load(sc, "/Users/echoesofconc/myCollaborativeFilter")

    // Per-user average rating, used for pairs the model cannot predict.
    // NOTE(review): this is computed from ratings_test, not ratings_train -- confirm this is intended.
    val user_avgrat = ratings_test
      .map(_.split(",") match { case Array(user, _, rate, _) => (user.toInt, (rate.toDouble, 1.0)) })
      .reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
      .mapValues { case (sum, count) => sum / count }

    // Predict user_mov ratings
    val user_mov = data_test_wo_header.map(_.split(',') match { case Array(user, mov) =>
      (user.toInt, mov.toInt)
    })
    val predictions = model.predict(user_mov).map { case Rating(user, mov, rate) =>
      ((user, mov), rate)
    }

    // Pairs the model returned nothing for keep 0.0 after the union + sum.
    val user_mov_rat = user_mov.map(pair => (pair, 0.0))
    val predictions_unpredicted_combined = predictions
      .union(user_mov_rat)
      .reduceByKey(_ + _)
      .map { case ((user, mov), rate) => (user, (mov, rate)) }

    // Combine average rating and predictions+unpredicted values
    val avg_rating_predictions_unpredicted_combined = predictions_unpredicted_combined.join(user_avgrat)

    // Final prediction: the model's rating, or the user's average where the model gave 0.0.
    val final_predictions = avg_rating_predictions_unpredicted_combined.map {
      case (user, ((mov, predicted), userAvg)) =>
        ((user, mov), if (predicted == 0.0) userAvg else predicted)
    }

    // Adjust for ratings above 5.0 and below 0.0
    val final_predictions_adjusted = final_predictions.map { case (key, rating) =>
      (key, if (rating > 5.0) 5.0 else if (rating < 0.0) 0.0 else rating)
    }

    val ratesAndPreds = ratings_test
      .map(_.split(",") match { case Array(user, mov, rate, _) => ((user.toInt, mov.toInt), rate.toDouble) })
      .join(final_predictions_adjusted)
    val MSE = ratesAndPreds.map { case (_, (actual, predicted)) =>
      val err = actual - predicted
      err * err
    }.mean()
    val RMSE = math.sqrt(MSE)

    // Print output.txt
    print_outputfile(final_predictions_adjusted)
    // Print the prediction results
    print_results(final_predictions_adjusted.sortByKey())
    print(RMSE + "\n")
  }
}
This worked out fine. It uses a key-based subtraction (subtractByKey) to create the testing and training data:
/**
* Created by echoesofconc on 3/8/17.
*/
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import java.io._
/**
 * Key-based variant: ratings and test rows are keyed by (user, movie) and split with
 * subtractByKey. A previously saved ALS model is loaded, the held-out pairs are predicted,
 * written to ./output.txt, and a rating histogram and the RMSE are printed.
 */
object Prateek_Agrawal_task1 {

  /** Removes the first line of the first partition, i.e. the CSV header. */
  def dropheader(data: RDD[String]): RDD[String] =
    data.mapPartitionsWithIndex { (idx, lines) =>
      // Iterator.drop returns the shortened iterator; its result has to be used.
      if (idx == 0) lines.drop(1) else lines
    }

  /** (user, movie) pairs of the test rows, for constant-time membership checks. */
  private def heldOutKeys(ratings_testing: Array[Array[String]]): Set[(String, String)] =
    ratings_testing.map(row => (row(0), row(1))).toSet

  /** Rating rows whose (user, movie) pair does NOT occur in `ratings_testing`. */
  def create_training(ratings_split: RDD[Array[String]], ratings_testing: Array[Array[String]]) = {
    val heldOut = heldOutKeys(ratings_testing)
    ratings_split.filter(row => !heldOut.contains((row(0), row(1))))
  }

  /** Rating rows whose (user, movie) pair occurs in `ratings_testing`. */
  def create_testing(ratings_split: RDD[Array[String]], ratings_testing: Array[Array[String]]) = {
    val heldOut = heldOutKeys(ratings_testing)
    ratings_split.filter(row => heldOut.contains((row(0), row(1))))
  }

  /**
   * Trains an explicit-feedback ALS model with lambda 0.01.
   *
   * @param ratings_train records keyed by (user, item) with (rating, timestamp) as value
   */
  def create_model(ratings_train: RDD[((String, String), (String, String))], rank: Int, numIterations: Int): org.apache.spark.mllib.recommendation.MatrixFactorizationModel = {
    val ratings = ratings_train.map { case ((user, item), (rate, _)) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    }
    ALS.train(ratings, rank, numIterations, 0.01)
  }

  /** Prints how many predictions fall into each integer rating bucket; 5 is counted in ">=4". */
  def print_results(final_predictions_adjusted: RDD[((Int, Int), Double)]) = {
    val labels = Map(
      0 -> ">=0 and <1: ",
      1 -> ">=1 and <2: ",
      2 -> ">=2 and <3: ",
      3 -> ">=3 and <4: ",
      4 -> ">=4 "
    )
    final_predictions_adjusted
      .map { case (_, rating) =>
        val bucket = rating.toInt
        (if (bucket == 5) 4 else bucket, 1)
      }
      .reduceByKey(_ + _)
      .sortByKey()
      // Bring the few buckets to the driver so they print in order on the driver's stdout.
      .collect()
      .foreach { case (bucket, count) =>
        labels.get(bucket).foreach(label => print(label + count + "\n"))
      }
  }

  case class User_mov_rat(UserID: Int, MovieID: Int, Pred_rating: Double)

  /** Writes all predictions as CSV to ./output.txt; the whole RDD is collected to the driver. */
  def print_outputfile(final_predictions_adjusted: RDD[((Int, Int), Double)]) = {
    val writer = new FileWriter(new File("./output.txt"))
    try {
      writer.write("UserID,MovieID,Pred_rating\n")
      final_predictions_adjusted.collect().foreach { case ((user, movie), rating) =>
        writer.write(user + "," + movie + "," + rating + "\n")
      }
    } finally {
      writer.close()
    }
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Prateek_Agrawal_task1").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-latest-small/ratings.csv"
    // val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_small.csv"
    val file = "/Users/echoesofconc/Documents/USC_courses/INF553/ml-20m/ratings.csv"
    val test = "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/testing_20m.csv"
    val data = sc.textFile(file, 2).cache()
    val data_test = sc.textFile(test, 2).cache()

    // Key both inputs by (User ID, MovID): ratings carry (Rating, Time), test rows a dummy 1.
    val data_wo_header = dropheader(data)
      .map(_.split(","))
      .map(x => ((x(0), x(1)), (x(2), x(3))))
    val data_test_wo_header = dropheader(data_test)
      .map(_.split(","))
      .map(x => ((x(0), x(1)), 1))

    // Training = ratings without the test keys; test = ratings without the training keys.
    val ratings_train = data_wo_header.subtractByKey(data_test_wo_header)
    val ratings_test = data_wo_header.subtractByKey(ratings_train)
    data_test_wo_header.unpersist()
    data_wo_header.unpersist()
    data.unpersist()
    data_test.unpersist()

    val rank = 1
    val numIterations = 10
    // The model was trained and saved once with the two lines below and is now loaded from disk.
    // val model=create_model(ratings_train,rank,numIterations)
    // model.save(sc, "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/myCollaborativeFilter")
    val model = MatrixFactorizationModel.load(sc, "/Users/echoesofconc/Documents/USC_courses/INF553/Prateek_Agrawal_hw3/myCollaborativeFilter")

    // Per-user average rating from the training data, used for pairs the model cannot predict.
    val user_avgrat = ratings_train
      .map { case ((user, _), (rate, _)) => (user.toInt, (rate.toDouble, 1.0)) }
      .reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
      .mapValues { case (sum, count) => sum / count }
    ratings_train.unpersist()

    // Predict user_mov ratings
    val user_mov = data_test_wo_header.map { case ((user, mov), _) =>
      (user.toInt, mov.toInt)
    }
    val predictions = model.predict(user_mov).map { case Rating(user, mov, rate) =>
      ((user, mov), rate)
    }

    // Pairs the model returned nothing for keep 0.0 after the union + sum.
    val user_mov_rat = user_mov.map(pair => (pair, 0.0))
    val predictions_unpredicted_combined = predictions
      .union(user_mov_rat)
      .reduceByKey(_ + _)
      .map { case ((user, mov), rate) => (user, (mov, rate)) }

    // Combine average rating and predictions+unpredicted values
    val avg_rating_predictions_unpredicted_combined = predictions_unpredicted_combined.join(user_avgrat)

    // Final prediction: the model's rating, or the user's average where the model gave 0.0.
    val final_predictions = avg_rating_predictions_unpredicted_combined.map {
      case (user, ((mov, predicted), userAvg)) =>
        ((user, mov), if (predicted == 0.0) userAvg else predicted)
    }

    // Adjust for ratings above 5.0 and below 0.0
    val final_predictions_adjusted = final_predictions.map { case (key, rating) =>
      (key, if (rating > 5.0) 5.0 else if (rating < 0.0) 0.0 else rating)
    }

    val ratesAndPreds_map = ratings_test.map { case ((user, mov), (rate, _)) =>
      ((user.toInt, mov.toInt), rate.toDouble)
    }
    val ratesAndPreds = ratesAndPreds_map.join(final_predictions_adjusted)
    val MSE = ratesAndPreds.map { case (_, (actual, predicted)) =>
      val err = actual - predicted
      err * err
    }.mean()
    val RMSE = math.sqrt(MSE)

    // Print output.txt
    print_outputfile(final_predictions_adjusted)
    // Print the prediction results
    print_results(final_predictions_adjusted.sortByKey())
    print(RMSE + "\n")
  }
}
I created the following test, which fits a simple linear regression model to dummy streaming data.
I use hyper-parameter optimisation to find good values of stepSize, numIterations and initialWeights for the linear model.
Everything runs fine, except the last lines of the code, which are commented out:
// Save the evaluations for further visualization
// val gridEvalsRDD = sc.parallelize(gridEvals)
// gridEvalsRDD.coalesce(1)
// .map(e => "%.3f\t%.3f\t%d\t%.3f".format(e._1, e._2, e._3, e._4))
// .saveAsTextFile("data/mllib/streaming")
The problem is with the SparkContext sc. If I initialize it at the beginning of the test, the program fails with errors. It looks like sc should be defined in some special way in order to avoid conflicts with ssc (the streaming context). Any ideas?
The whole code:
// scalastyle:off
package org.apache.spark.mllib.regression
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
import org.apache.spark.streaming.TestSuiteBase
import org.scalatest.BeforeAndAfter
/**
 * Fits a streaming linear regression to generated data, once with fixed settings and once
 * for every combination of a small hyper-parameter grid, printing the RMSE of each run.
 */
class StreamingLinearRegressionHypeOpt extends TestSuiteBase with BeforeAndAfter {

  // use longer wait time to ensure job completion
  override def maxWaitTimeMillis: Int = 20000

  // Never assigned in this class; the `ssc` lambda parameters in the test shadow it.
  var ssc: StreamingContext = _

  override def afterFunction(): Unit = {
    super.afterFunction()
    if (ssc != null) {
      ssc.stop()
    }
  }

  /**
   * For each batch: sums the absolute errors of its (label, prediction) pairs and squares
   * that sum; the per-batch results are then added up and divided by `n`.
   *
   * NOTE(review): this squares the per-batch sum of absolute errors rather than summing
   * squared errors, so it is not the textbook MSE -- confirm the intended metric.
   */
  def calculateMSE(output: Seq[Seq[(Double, Double)]], n: Int): Double =
    output.map { seqOfPairs =>
      val err = seqOfPairs.map(p => math.abs(p._1 - p._2)).sum
      err * err
    }.sum / n

  /** Square root of [[calculateMSE]]. */
  def calculateRMSE(output: Seq[Seq[(Double, Double)]], n: Int): Double =
    math.sqrt(calculateMSE(output, n))

  /** Splits every string of the stream on single spaces. */
  def dummyStringStreamSplit(datastream: Stream[String]): Stream[String] =
    datastream.flatMap(txt => txt.split(" "))

  test("Test 1") {
    // generate sequence of simulated data for testing
    val numBatches = 10
    val nPoints = 100
    val inputData = (0 until numBatches).map { i =>
      LinearDataGenerator.generateLinearInput(0.0, Array(10.0, 10.0), nPoints, 42 * (i + 1))
    }

    // Trains `lr` on the batches in a fresh streaming context, prints and returns the RMSE.
    def trainAndScore(lr: StreamingLinearRegressionWithSGD): Double =
      withStreamingContext(setupStreams(inputData, (inputDStream: DStream[LabeledPoint]) => {
        lr.trainOn(inputDStream)
        lr.predictOnValues(inputDStream.map(x => (x.label, x.features)))
      })) { ssc =>
        val output: Seq[Seq[(Double, Double)]] = runStreams(ssc, numBatches, numBatches)
        val rmse = calculateRMSE(output, nPoints)
        println(s"RMSE = $rmse")
        rmse
      }

    // Without hyper-parameters optimization: model initialized with zero weights
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(0.0, 0.0))
      .setStepSize(0.2)
      .setNumIterations(25)
    trainAndScore(model)

    // With hyper-parameters optimization: typed grids, so no casts are needed
    val initialWeightsGrid: List[Vector] = List(Vectors.dense(0.0, 0.0), Vectors.dense(10.0, 10.0))
    val stepSizeGrid: List[Double] = List(0.1, 0.2, 0.3)
    val numIterationsGrid: List[Int] = List(25, 50)

    val gridEvals = for {
      initialWeights <- initialWeightsGrid
      stepSize <- stepSizeGrid
      numIterations <- numIterationsGrid
    } yield {
      val lr = new StreamingLinearRegressionWithSGD()
        .setInitialWeights(initialWeights)
        .setStepSize(stepSize)
        .setNumIterations(numIterations)
      (initialWeights, stepSize, numIterations, trainAndScore(lr))
    }

    // Save the evaluations for further visualization
    // NOTE(review): `sc` is not defined in this class, and e._1 is a Vector, which "%.3f"
    // cannot format -- both need resolving before enabling the lines below.
    // val gridEvalsRDD = sc.parallelize(gridEvals)
    // gridEvalsRDD.coalesce(1)
    // .map(e => "%.3f\t%.3f\t%d\t%.3f".format(e._1, e._2, e._3, e._4))
    // .saveAsTextFile("data/mllib/streaming")
  }
}
// scalastyle:on