I have some data; for the sake of discussion, take it to be given by:
val schema = Seq("id", "day", "value")
val data = Seq(
(1, 1, 1),
(1, 2, 11),
(1, 3, 1),
(1, 4, 11),
(1, 5, 1),
(1, 6, 11),
(2, 1, 1),
(2, 2, 11),
(2, 3, 1),
(2, 4, 11),
(2, 5, 1),
(2, 6, 11)
)
val df = sc.parallelize(data).toDF(schema: _*)
I would like to compute quartiles for each ID over a moving window of days, something like:
val w = Window.partitionBy("id").orderBy("day").rangeBetween(-2, 0)
df.select(col("id"),col("day"),collect_list(col("value")).over(w),quartiles(col("value")).over(w).as("Quartiles"))
Of course, there isn't a quartiles function for this, so I need to write a UserDefinedAggregateFunction. The following is a simple (albeit non-scalable) solution, based on this CollectionFunction example:
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
class QuartilesFunction extends UserDefinedAggregateFunction {
def inputSchema: StructType =
StructType(StructField("value", DoubleType, false) :: Nil)
def bufferSchema: StructType = StructType(StructField("lower", ArrayType(DoubleType, true), true) :: StructField("upper", ArrayType(DoubleType, true), true) :: Nil)
override def dataType: DataType = ArrayType(DoubleType, true)
def deterministic: Boolean = true
def initialize(buffer: MutableAggregationBuffer): Unit = {
buffer(0) = IndexedSeq[Double]()
buffer(1) = IndexedSeq[Double]()
}
def rebalance(lower : IndexedSeq[Double], upper : IndexedSeq[Double]) = {
(lower++upper).splitAt((lower.length+upper.length)/2)
}
def sorted_median(x : IndexedSeq[Double]) : Option[Double] = {
if(x.length == 0) {
None
} else {
val N = x.length
val (lower, upper) = x.splitAt(N/2)
Some(
if(N%2==0) {
(lower.last+upper.head)/2.0
} else {
upper.head
}
)
}
}
// this is how to update the buffer given an input
def update(buffer : MutableAggregationBuffer, input : Row) : Unit = {
val lower = buffer(0).asInstanceOf[IndexedSeq[Double]]
val upper = buffer(1).asInstanceOf[IndexedSeq[Double]]
val value = input.getAs[Double](0)
if(lower.length == 0) {
buffer(0) = Array(value)
} else {
if(value >= lower.last) {
buffer(1) = (value +: upper).sortWith(_<_)
} else {
buffer(0) = (lower :+ value).sortWith(_<_)
}
}
val (result0,result1) = rebalance(buffer(0).asInstanceOf[IndexedSeq[Double]],buffer(1).asInstanceOf[IndexedSeq[Double]])
buffer(0) = result0
buffer(1) = result1
}
// this is how to merge two objects with the buffer schema type
def merge(buffer1 : MutableAggregationBuffer, buffer2 : Row) : Unit = {
buffer1(0) = buffer1(0).asInstanceOf[IndexedSeq[Double]] ++ buffer2(0).asInstanceOf[IndexedSeq[Double]]
buffer1(1) = buffer1(1).asInstanceOf[IndexedSeq[Double]] ++ buffer2(1).asInstanceOf[IndexedSeq[Double]]
val (result0,result1) = rebalance(buffer1(0).asInstanceOf[IndexedSeq[Double]],buffer1(1).asInstanceOf[IndexedSeq[Double]])
buffer1(0) = result0
buffer1(1) = result1
}
def evaluate(buffer: Row): Array[Option[Double]] = {
val lower =
if (buffer(0) == null) {
IndexedSeq[Double]()
} else {
buffer(0).asInstanceOf[IndexedSeq[Double]]
}
val upper =
if (buffer(1) == null) {
IndexedSeq[Double]()
} else {
buffer(1).asInstanceOf[IndexedSeq[Double]]
}
val Q1 = sorted_median(lower)
val Q2 = if(upper.length==0) { None } else { Some(upper.head) }
val Q3 = sorted_median(upper)
Array(Q1,Q2,Q3)
}
}
However, executing the following produces an error:
val quartiles = new QuartilesFunction
df.select('*).show
val w = org.apache.spark.sql.expressions.Window.partitionBy("id").orderBy("day").rangeBetween(-2, 0)
val x = df.select(col("id"),col("day"),collect_list(col("value")).over(w),quartiles(col("value")).over(w).as("Quantiles"))
x.show
The error is:
org.apache.spark.SparkException: Task not serializable
The offending function seems to be sorted_median. If I replace the code with:
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
class QuartilesFunction extends UserDefinedAggregateFunction {
def inputSchema: StructType =
StructType(StructField("value", DoubleType, false) :: Nil)
def bufferSchema: StructType = StructType(StructField("lower", ArrayType(DoubleType, true), true) :: StructField("upper", ArrayType(DoubleType, true), true) :: Nil)
override def dataType: DataType = ArrayType(DoubleType, true)
def deterministic: Boolean = true
def initialize(buffer: MutableAggregationBuffer): Unit = {
buffer(0) = IndexedSeq[Double]()
buffer(1) = IndexedSeq[Double]()
}
def rebalance(lower : IndexedSeq[Double], upper : IndexedSeq[Double]) = {
(lower++upper).splitAt((lower.length+upper.length)/2)
}
/*
def sorted_median(x : IndexedSeq[Double]) : Option[Double] = {
if(x.length == 0) {
None
} else {
val N = x.length
val (lower, upper) = x.splitAt(N/2)
Some(
if(N%2==0) {
(lower.last+upper.head)/2.0
} else {
upper.head
}
)
}
}
*/
// this is how to update the buffer given an input
def update(buffer : MutableAggregationBuffer, input : Row) : Unit = {
val lower = buffer(0).asInstanceOf[IndexedSeq[Double]]
val upper = buffer(1).asInstanceOf[IndexedSeq[Double]]
val value = input.getAs[Double](0)
if(lower.length == 0) {
buffer(0) = Array(value)
} else {
if(value >= lower.last) {
buffer(1) = (value +: upper).sortWith(_<_)
} else {
buffer(0) = (lower :+ value).sortWith(_<_)
}
}
val (result0,result1) = rebalance(buffer(0).asInstanceOf[IndexedSeq[Double]],buffer(1).asInstanceOf[IndexedSeq[Double]])
buffer(0) = result0
buffer(1) = result1
}
// this is how to merge two objects with the buffer schema type
def merge(buffer1 : MutableAggregationBuffer, buffer2 : Row) : Unit = {
buffer1(0) = buffer1(0).asInstanceOf[IndexedSeq[Double]] ++ buffer2(0).asInstanceOf[IndexedSeq[Double]]
buffer1(1) = buffer1(1).asInstanceOf[IndexedSeq[Double]] ++ buffer2(1).asInstanceOf[IndexedSeq[Double]]
val (result0,result1) = rebalance(buffer1(0).asInstanceOf[IndexedSeq[Double]],buffer1(1).asInstanceOf[IndexedSeq[Double]])
buffer1(0) = result0
buffer1(1) = result1
}
def evaluate(buffer: Row): Array[Option[Double]] = {
val lower =
if (buffer(0) == null) {
IndexedSeq[Double]()
} else {
buffer(0).asInstanceOf[IndexedSeq[Double]]
}
val upper =
if (buffer(1) == null) {
IndexedSeq[Double]()
} else {
buffer(1).asInstanceOf[IndexedSeq[Double]]
}
val Q1 = Some(1.0)//sorted_median(lower)
val Q2 = Some(2.0)//if(upper.length==0) { None } else { Some(upper.head) }
val Q3 = Some(3.0)//sorted_median(upper)
Array(Q1,Q2,Q3)
}
}
Then everything works, except that it doesn't compute quartiles (obviously). I don't understand the error, and the rest of the stack trace isn't any more illuminating. Could someone help me understand what the problem is and/or how to compute these quartiles?
If you have a HiveContext (or Hive support enabled), you can use the percentile UDAF as follows:
val dfQuartiles = df.select(
col("id"),
col("day"),
collect_list(col("value")).over(w).as("values"),
callUDF("percentile", col("value"), lit(0.25)).over(w).as("Q1"),
callUDF("percentile", col("value"), lit(0.50)).over(w).as("Q2"),
callUDF("percentile", col("value"), lit(0.75)).over(w).as("Q3"),
callUDF("percentile", col("value"), lit(1.0)).over(w).as("Q4")
)
Alternatively, you can use a UDF to calculate the quartiles from the values column (as you have this array anyway):
val calcPercentile = udf((xs:Seq[Int], percentile:Double) => {
val ss = xs.toSeq.sorted
val index = ((ss.size-1)*percentile).toInt
ss(index)
}
)
val dfQuartiles = df.select(
col("id"),
col("day"),
collect_list(col("value")).over(w).as("values")
)
.withColumn("Q1",calcPercentile($"values",lit(0.25)))
.withColumn("Q2",calcPercentile($"values",lit(0.50)))
.withColumn("Q3",calcPercentile($"values",lit(0.75)))
.withColumn("Q4",calcPercentile($"values",lit(1.00)))
Related
I have an already written UDAF in Scala using Spark 2.4. Since our Databricks cluster was on the 6.4 runtime, which is no longer supported, we need to move to 7.3 LTS, which has long-term support and uses Spark 3. UserDefinedAggregateFunction is deprecated in Spark 3 and will most likely be removed in the future, so I am trying to convert the UDAF into an Aggregator:
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{IntegerType,StringType, StructField, StructType, DataType}
object MaxCampaignIdAggregator extends UserDefinedAggregateFunction with java.io.Serializable{
override def inputSchema: StructType = new StructType()
.add("id", IntegerType, true)
.add("name", StringType, true)
def bufferSchema: StructType = new StructType()
.add("id", IntegerType, true)
.add("name", StringType, true)
// Returned data type
def dataType: DataType = new StructType()
.add("id", IntegerType, true)
.add("name", StringType, true)
// Self-explaining
def deterministic: Boolean = true
// This function is called whenever key changes
def initialize(buffer: MutableAggregationBuffer) = {
buffer(0) = null
buffer(1) = null
}
// Iterate over each entry of a group
def update(buffer: MutableAggregationBuffer, inputRow: Row): Unit ={
val inputId = inputRow.getAs[Int](0)
val actualInputId = inputRow.get(0)
val inputName = inputRow.getString(1)
val bufferId = buffer.getAs[Int](0)
val actualBufferId = buffer.get(0)
val bufferName = buffer.getString(1)
if(actualBufferId == null){
buffer(0) = actualInputId
buffer(1) = inputName
}else if(actualInputId != null) {
if(inputId > bufferId){
buffer(0) = inputId
buffer(1) = inputName
}
}
}
// Merge two partial aggregates
def merge(buffer1: MutableAggregationBuffer, buffer2: Row) = {
val buffer1Id = buffer1.getAs[Int](0)
val actualbuffer1Id = buffer1.get(0)
val buffer1Name = buffer1.getString(1)
val buffer2Id = buffer2.getAs[Int](0)
val actualbuffer2Id = buffer2.get(0)
val buffer2Name = buffer2.getString(1)
if(actualbuffer1Id == null){
buffer1(0) = actualbuffer2Id
buffer1(1) = buffer2Name
}else if(actualbuffer2Id != null){
if(buffer2Id > buffer1Id){
buffer1(0) = buffer2Id
buffer1(1) = buffer2Name
}
}
}
// Called after all the entries are exhausted.
def evaluate(buffer: Row): Any = {
Row(buffer.get(0), buffer.getString(1))
}
}
After usage, this gives output like:
{"id": 1282, "name": "McCormick Christmas"}
{"id": 1305, "name": "McCormick Perfect Pinch"}
{"id": 1677, "name": "Viking Cruises Viking Cruises"}
I have a DF looking like this:
time,channel,value
0,foo,5
0,bar,23
100,foo,42
...
I want a DF like this:
time,foo,bar
0,5,23
100,42,...
In Spark 2, I did it with a UDAF like this:
case class ColumnBuilderUDAF(channels: Seq[String]) extends UserDefinedAggregateFunction {
@transient lazy val inputSchema: StructType = StructType {
StructField("channel", StringType, nullable = false) ::
StructField("value", DoubleType, nullable = false) ::
Nil
}
@transient lazy val bufferSchema: StructType = StructType {
channels
.toList
.indices
.map(i => StructField("c%d".format(i), DoubleType, nullable = false))
}
@transient lazy val dataType: DataType = bufferSchema
@transient lazy val deterministic: Boolean = false
def initialize(buffer: MutableAggregationBuffer): Unit = channels.indices.foreach(buffer(_) = Double.NaN)
def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
val channel = input.getAs[String](0)
val p = channels.indexOf(channel)
if (p >= 0 && p < channels.length) {
val v = input.getAs[Double](1)
if (!v.isNaN) {
buffer(p) = v
}
}
}
def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
channels
.indices
.foreach { i =>
val v2 = buffer2.getAs[Double](i)
if ((!v2.isNaN) && buffer1.getAs[Double](i).isNaN) {
buffer1(i) = v2
}
}
def evaluate(buffer: Row): Any =
new GenericRowWithSchema(channels.indices.map(buffer.getAs[Double]).toArray, dataType.asInstanceOf[StructType])
}
which I use like this:
val cb = ColumnBuilderUDAF(Seq("foo", "bar"))
val dfColumnar = df.groupBy($"time").agg(cb($"channel", $"value") as "c")
and then, I rename c.c0, c.c1 etc. to foo, bar etc.
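That rename step might look roughly like this (a sketch only, assuming the aggregated struct column c exposes fields c0, c1, … in the same order as channels):
import org.apache.spark.sql.functions.col

// Hypothetical rename of the struct fields back to channel names.
val channels = Seq("foo", "bar")
val renamedCols = col("time") +: channels.zipWithIndex.map { case (name, i) => col(s"c.c$i").as(name) }
val dfRenamed = dfColumnar.select(renamedCols: _*)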
In Spark 3, UDAF is deprecated and Aggregator should be used instead. So I began to port it like this:
case class ColumnBuilder(channels: Seq[String]) extends Aggregator[(String, Double), Array[Double], Row] {
lazy val bufferEncoder: Encoder[Array[Double]] = Encoders.javaSerialization[Array[Double]]
lazy val zero: Array[Double] = channels.map(_ => Double.NaN).toArray
def reduce(b: Array[Double], a: (String, Double)): Array[Double] = {
val index = channels.indexOf(a._1)
if (index >= 0 && !a._2.isNaN) b(index) = a._2
b
}
def merge(b1: Array[Double], b2: Array[Double]): Array[Double] = {
(0 until b1.length.min(b2.length)).foreach(i => if (b1(i).isNaN) b1(i) = b2(i))
b1
}
def finish(reduction: Array[Double]): Row =
new GenericRowWithSchema(reduction.map(x => x: Any), outputEncoder.schema)
def outputEncoder: Encoder[Row] = ??? // what goes here?
}
I don't know how to implement the Encoder[Row] as Spark does not have a pre-defined one. If I simply do a straightforward approach like this:
val outputEncoder: Encoder[Row] = new Encoder[Row] {
val schema: StructType = StructType(channels.map(StructField(_, DoubleType, nullable = false)))
val clsTag: ClassTag[Row] = classTag[Row]
}
I get a ClassCastException because outputEncoder actually has to be ExpressionEncoder.
So, how do I implement this correctly? Or do I still have to use the deprecated UDAF?
You can do it with groupBy and pivot:
import spark.implicits._
import org.apache.spark.sql.functions._
val df = Seq(
(0, "foo", 5),
(0, "bar", 23),
(100, "foo", 42)
).toDF("time", "channel", "value")
df.groupBy("time")
.pivot("channel")
.agg(first("value"))
.show(false)
Output:
+----+----+---+
|time|bar |foo|
+----+----+---+
|100 |null|42 |
|0 |23 |5 |
+----+----+---+
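If you do want to keep the Aggregator approach rather than pivoting, one commonly used workaround for outputEncoder is to build the encoder from the schema via RowEncoder. This is a sketch only: RowEncoder lives in a catalyst package (technically internal API) and its factory method changed in later 3.x releases, so it assumes roughly Spark 3.0–3.4:
import org.apache.spark.sql.{Encoder, Row}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

// Drop-in body for the missing member of ColumnBuilder:
// RowEncoder(schema) yields the ExpressionEncoder[Row] that Spark expects at runtime.
def outputEncoder: Encoder[Row] =
  RowEncoder(StructType(channels.map(c => StructField(c, DoubleType, nullable = false))))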
How can I convert the output of my VectorAssembler to a dense vector rather than a sparse vector?
val featureIndexer = new VectorAssembler().setInputCols(Array("feature1","feature2","feature3")).setOutputCol("indexedFeatures")
training_set_combined = training_set_combined.na.fill(-9999)
testing_set_combined = testing_set_combined.na.fill(-9999)
// training
val assembler = new VectorAssembler().setInputCols(feature_names.toArray).setOutputCol("features")
def get_param(): mutable.HashMap[String, Any] = {
val params = new mutable.HashMap[String, Any]()
params += "eta" -> 0.1f
params += "num_round" -> 150
params += "missing" -> -999
params += "subsample" -> 1
params += "objective" -> "binary:logistic"
return params
}
val xgb = new XGBoostClassifier(get_param().toMap).setLabelCol("label").setFeaturesCol("features")
val pipeline = new Pipeline().setStages(Array(assembler, xgb))
val xgbclassifier = pipeline.fit(training_set_combined)
I am looking to convert the VectorAssembler output to a dense vector.
Here is the dense vector Transformer implementation:
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.SQLDataTypes
import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCols}
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
class DenseVectorConverter(val uid: String) extends Transformer with Params
with HasInputCols with HasOutputCols with DefaultParamsWritable {
def this() = this(Identifiable.randomUID("denseVectorConverter"))
/** @group setParam */
def setInputCols(value: Array[String]): this.type = set(inputCols, value)
/** @group setParam */
def setOutputCols(value: Array[String]): this.type = set(outputCols, value)
def validateAndTransformSchema(schema: StructType): StructType = {
require($(inputCols).length == $(inputCols).distinct.length, s"inputCols contains" +
s" duplicates: (${$(inputCols).mkString(", ")})")
require($(outputCols).length == $(outputCols).distinct.length, s"outputCols contains" +
s" duplicates: (${$(outputCols).mkString(", ")})")
require($(inputCols).length == $(outputCols).length, s"inputCols(${$(inputCols).length})" +
s" and outputCols(${$(outputCols).length}) should have the same length")
$(inputCols).zip($(outputCols)).foldLeft(schema) { (schema, inOutCol) =>
val inputField = schema(inOutCol._1)
require(inputField.dataType == SQLDataTypes.VectorType, s"Expected datatype of input col: ${inputField.name} as " +
s"vector but found ${inputField.dataType}")
schema.add(inOutCol._2, inputField.dataType, inputField.nullable, inputField.metadata)
}
}
def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema)
def copy(extra: ParamMap): DenseVectorConverter = defaultCopy(extra)
override def transform(dataset: Dataset[_]): DataFrame = {
transformSchema(dataset.schema, logging = true)
val sparseToDense =
udf((v: org.apache.spark.ml.linalg.Vector) => v.toDense)
$(inputCols).zip($(outputCols)).foldLeft(dataset.toDF()) { (df, inputColOutputCol) =>
df.withColumn(inputColOutputCol._2,
sparseToDense(col(inputColOutputCol._1)));
}
}
}
object DenseVectorConverter extends DefaultParamsReadable[DenseVectorConverter] {
override def load(path: String): DenseVectorConverter = super.load(path)
}
I tested it using the test case below:
import org.apache.spark.ml.linalg.Vectors
val data = Array(
Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
)
val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
df.show(false)
// +---------------------+
// |features |
// +---------------------+
// |(5,[1,3],[1.0,7.0]) |
// |[2.0,0.0,3.0,4.0,5.0]|
// |[4.0,0.0,0.0,6.0,7.0]|
// +---------------------+
val denseVectorConverter = new DenseVectorConverter()
.setInputCols(Array("features"))
.setOutputCols(Array("features_dense"))
denseVectorConverter.transform(df).show(false)
// +---------------------+---------------------+
// |features |features_dense |
// +---------------------+---------------------+
// |(5,[1,3],[1.0,7.0]) |[0.0,1.0,0.0,7.0,0.0]|
// |[2.0,0.0,3.0,4.0,5.0]|[2.0,0.0,3.0,4.0,5.0]|
// |[4.0,0.0,0.0,6.0,7.0]|[4.0,0.0,0.0,6.0,7.0]|
// +---------------------+---------------------+
Now your modified code should look like this:
val featureIndexer = new VectorAssembler().setInputCols(Array("feature1","feature2","feature3")).setOutputCol("indexedFeatures")
training_set_combined = training_set_combined.na.fill(-9999)
testing_set_combined = testing_set_combined.na.fill(-9999)
// training
val assembler = new VectorAssembler().setInputCols(feature_names.toArray).setOutputCol("features")
val denseVectorConverter = new DenseVectorConverter()
.setInputCols(Array("features"))
.setOutputCols(Array("features_dense"))
def get_param(): mutable.HashMap[String, Any] = {
val params = new mutable.HashMap[String, Any]()
params += "eta" -> 0.1f
params += "num_round" -> 150
params += "missing" -> -999
params += "subsample" -> 1
params += "objective" -> "binary:logistic"
return params
}
val xgb = new XGBoostClassifier(get_param().toMap).setLabelCol("label").setFeaturesCol("features_dense")
val pipeline = new Pipeline().setStages(Array(assembler, denseVectorConverter, xgb))
val xgbclassifier = pipeline.fit(training_set_combined)
I've modified your code; please test it.
Hi, I am trying to write a simple hill-climbing algorithm in Scala.
I have State and HillClimbing that are traits.
I define them as concrete classes when I apply them to the Graph problem.
In GraphHillClimbing I receive two errors. This is because I use GraphState instead of State (observe that GraphState is also a State).
How can I solve this?
trait State {
val loc = 0
def neighbours: List[State]
def get_loc():Int = return loc
}
class GraphState(loc:Int, g: Map[Int, List[Int]]) extends State {
def neighbours():List[GraphState] =
{
def neighboursAcc(l:List[Int], acc:List[GraphState], g:Map[Int, List[Int]]):List[GraphState] =
{
if(l.isEmpty) acc
else {
val new_neig = new GraphState(l.head, g)
neighboursAcc(l.tail, List(new_neig) ++ acc, g)
}
}
neighboursAcc(g(loc), List(), g)
}
}
trait HillClimbing {
val max_iteration = 4
val start:State
def cost(state:State):Double
private def argmin(costs:List[Double]):Int = {
val best = costs.min
costs.indexOf(best)
}
private def next_best(states:List[State]):State = {
val costs = states map(x => cost(x))
val pos = argmin(costs)
states(pos)
}
def minimize():State = {
def minimizeAcc(iteration:Int, state:State):State =
{
if(iteration > max_iteration) state
else {
val neigs = state.neighbours
val next_state = next_best(neigs)
minimizeAcc(iteration+1, next_state)
}
}
minimizeAcc(0, start)
}
}
class GraphHillClimbing(start:GraphState, goal:GraphState) extends HillClimbing {
// ERROR 1 = start was State and now it is GraphState
// ERROR 2 = cost should take a State
def cost(current_state:GraphState):Double = {
val distance = goal.get_loc() - current_state.get_loc()
if(distance > 0 ) distance
else -distance
}
}
object RunHillClimbing {
def main(args: Array[String]) {
val G = Map[Int, List[Int]](1->List(2, 4, 5), 2->List(1, 3, 4), 3->List(2, 6), 4->List(1, 2, 5), 5->List(1, 4), 6->List(3))
val start = new GraphState(1, G)
val goal = new GraphState(6, G)
val hc = new GraphHillClimbing(start, goal)
print(hc.minimize())
}
}
I think this can be solved using some type parameters with type bounds.
Also in your constructor for GraphHillClimbing you should use val to indicate that the parameter start is the concrete implementation of the abstract start.
trait State[+Self] {
Self =>
def loc:Int
def neighbours: List[Self]
def get_loc():Int = return loc
}
class GraphState(val loc:Int, g: Map[Int, List[Int]]) extends State[GraphState] {
def neighbours():List[GraphState] =
{
def neighboursAcc(l:List[Int], acc:List[GraphState], g:Map[Int, List[Int]]):List[GraphState] =
{
if(l.isEmpty) acc
else {
val new_neig = new GraphState(l.head, g)
neighboursAcc(l.tail, List(new_neig) ++ acc, g)
}
}
neighboursAcc(g(loc), List(), g)
}
}
trait HillClimbing[T<:State[T]] {
val max_iteration = 4
val start:T
def cost(state:T):Double
private def argmin(costs:List[Double]):Int = {
val best = costs.min
costs.indexOf(best)
}
private def next_best(states:List[T]):T = {
val costs = states map(x => cost(x))
val pos = argmin(costs)
states(pos)
}
def minimize():T = {
def minimizeAcc(iteration:Int, state:T):T =
{
if(iteration > max_iteration) state
else {
val neigs = state.neighbours
val next_state = next_best(neigs)
minimizeAcc(iteration+1, next_state)
}
}
minimizeAcc(0, start)
}
}
class GraphHillClimbing(val start:GraphState, goal:GraphState) extends HillClimbing[GraphState] {
def cost(current_state:GraphState):Double = {
val distance = goal.get_loc() - current_state.get_loc()
if(distance > 0 ) distance
else -distance
}
}
object RunHillClimbing {
def main(args: Array[String]) {
val G = Map[Int, List[Int]](1->List(2, 4, 5), 2->List(1, 3, 4), 3->List(2, 6), 4->List(1, 2, 5), 5->List(1, 4), 6->List(3))
val start = new GraphState(1, G)
val goal = new GraphState(6, G)
val hc = new GraphHillClimbing(start, goal)
print(hc.minimize())
}
}
What I get:
error: class GraphHillClimbing needs to be abstract, since:
it has 2 unimplemented members.
/** As seen from class GraphHillClimbing, the missing signatures are as follows.
* For convenience, these are usable as stub implementations.
*/
def cost(state: this.State): Double = ???
val start: this.State = ???
class GraphHillClimbing(start:GraphState, goal:GraphState) extends HillClimbing {
^
Replace GraphState in the class with State, because inheritance demands that you handle State, not GraphState.
Then replace
val loc = 0
with
def loc = 0
so you can override it in GraphState.
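A minimal sketch of what that amounts to, together with the val start point from the error message above (illustrative only; GraphState's neighbour construction is condensed):
trait State {
  def loc: Int                    // abstract, so concrete states supply it
  def neighbours: List[State]
  def get_loc(): Int = loc
}

class GraphState(val loc: Int, g: Map[Int, List[Int]]) extends State {
  def neighbours: List[State] = g(loc).map(n => new GraphState(n, g))
}

class GraphHillClimbing(val start: State, goal: State) extends HillClimbing {
  // cost now accepts any State, matching the abstract member's signature
  def cost(current_state: State): Double =
    math.abs(goal.get_loc() - current_state.get_loc()).toDouble
}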
I'm having an issue with a Future[List] inside a recursion.
When I implemented this method without Futures, I used a ListBuffer and added items to the list:
val filtered = ListBuffer.empty[PostMD]
filtered ++= postMd.filter(_.fromID == userID)
Now I'm trying to implement it with Futures, but I can't find a similar solution.
What is the best way to work with a Future[List]?
def getData(url: String, userID: String) = {
val filtered: (List[PostMD]) => Future[List[PostMD]] = Future[List[PostMD]]
def inner(url: String): Unit = {
val chunk: Future[JsValue] = BusinessLogic.Methods.getJsonValue(url)
val postMd: Future[List[PostMD]] = for {
x <- chunk.map(_.\("data").as[List[JsValue]])
y <- x.map(_.\("data").as[PostMD])
} yield y
filtered = postMd.map(_.filter(_.fromID == userID)) // <- returned Future[List[PostMD]]
val next: String = (chunk.map(_.\("paging").\("next"))).toString
if (next != null) inner(next)
}
inner(url)
filtered
}
thanks,
miki
I tried to do what you want with random number generation.
import scala.concurrent.{Await, Future}
import scala.util.Random
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
val RANDOM = new Random()
def futureRec(num: Int, f: Future[List[Integer]]): Future[List[Integer]] = {
if(num == 0) {
f
} else {
f.flatMap(l => {
futureRec(num - 1, Future.successful(RANDOM.nextInt() :: l))
})
}
}
val futureResult = futureRec(5, Future.successful(Nil))
Await.result(futureResult, 5 minutes)
So, to do what you want, I would do something like this:
def getData(url: String, userID: String):Future[List[PostMD]] = {
def inner(url: String, f: Future[List[PostMD]]): Future[List[PostMD]] = {
val chunk: Future[JsValue] = ???
chunk.flatMap(ch => {
val postMd = (ch \ "data").\\("data").map(_.as[PostMD]).toList
val relatedPostMd = postMd.filter(_.fromID == userID)
val next: String = (ch.\("paging").\("next")).as[String]
if (next != null)
inner(next, f.map(l => l ++ relatedPostMd))
else
f.map(l => l ++ relatedPostMd)
})
}
inner(url, Future.successful(Nil))
}