Spark rolling window with multiple occurrence in same date - pyspark

Input:
+----------+---+
|        ym|val|
+----------+---+
|2022-03-01|  8|
|2022-04-01|  7|
|2022-05-01|  7|
|2022-06-01| 10|
|2022-07-01|  4|
|2022-07-01|  1|
+----------+---+
Request:
Would like to calculate a 3-month rolling sum and average across time.
But there are two rows for "2022-07-01", and I would like to get a result for both rows.
Expected Output:

If you can use rowsBetween instead of rangeBetween, you can assign a row number to each of the dates in an order and then use that row number in the sum and avg window.
Below is an example.
from pyspark.sql import functions as func
from pyspark.sql.window import Window as wd

data_sdf. \
    withColumn('rn', func.row_number().over(wd.partitionBy().orderBy('ym'))). \
    withColumn('sum_val_roll3m',
               func.sum('val').over(wd.partitionBy().orderBy('rn').rowsBetween(-2, 0))
               ). \
    withColumn('mean_val_roll3m',
               func.avg('val').over(wd.partitionBy().orderBy('rn').rowsBetween(-2, 0))
               ). \
    show()
# +----------+---+---+--------------+-----------------+
# | ym|val| rn|sum_val_roll3m| mean_val_roll3m|
# +----------+---+---+--------------+-----------------+
# |2022-03-01| 8| 1| 8| 8.0|
# |2022-04-01| 7| 2| 15| 7.5|
# |2022-05-01| 7| 3| 22|7.333333333333333|
# |2022-06-01| 10| 4| 24| 8.0|
# |2022-07-01| 4| 5| 21| 7.0|
# |2022-07-01| 1| 6| 15| 5.0|
# +----------+---+---+--------------+-----------------+
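For contrast, here is a sketch of a range-based 3-month window (using the same func and wd aliases as above, and assuming ym is a yyyy-MM-dd date). It would treat both 2022-07-01 rows as the same point in time and give them identical rolling values, which is why the row-number trick is used when you want a separate result per row.

# hypothetical month index so rangeBetween can count months instead of rows
data_sdf. \
    withColumn('mth_idx', func.year('ym') * 12 + func.month('ym')). \
    withColumn('sum_val_roll3m',
               func.sum('val').over(wd.partitionBy().orderBy('mth_idx').rangeBetween(-2, 0))
               ). \
    show()
# both 2022-07-01 rows would get sum_val_roll3m = 22 (7 + 10 + 4 + 1)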


Compare sum of values between two specific date ranges over different categories

I'm working in Databricks. I have the following dataframe:
+----------+---+-----+
| date|cat|value|
+----------+---+-----+
|2022-08-11| a| 1|
|2022-08-12| a| 1|
|2022-08-13| a| 1|
|2022-08-14| a| 1|
|2022-08-15| a| 1|
|2022-08-16| a| 1|
|2022-08-17| a| 2|
|2022-08-18| a| 2|
|2022-08-19| a| 2|
|2022-08-20| a| 2|
|2022-08-21| a| 2|
|2022-08-22| a| 2|
|2022-08-11| b| 1|
|2022-08-12| b| 1|
|2022-08-13| b| 1|
|2022-08-14| b| 1|
|2022-08-15| b| 1|
|2022-08-16| b| 1|
|2022-08-17| b| 3|
|2022-08-18| b| 3|
|2022-08-19| b| 3|
|2022-08-20| b| 3|
|2022-08-21| b| 3|
|2022-08-22| b| 3|
+----------+---+-----+
I want to be able to compare the sum of the values between the 17th and the 22nd (week 1) and between the 11th and the 16th (week 2). The start and end date of each period are predefined.
So far I've tried something like this:
from pyspark.sql import functions as f
from pyspark.sql.window import Window

w = (Window.partitionBy('cat'))
df = (df
      .withColumn('date', f.to_date('date', 'yyyy-MM-dd'))
      .withColumn('value_week_1',
                  f.when(
                      (f.col('date') >= '2022-08-17') &
                      (f.col('date') <= '2022-08-22'),
                      f.sum('value').over(w)
                  ))
      .withColumn('value_week_2',
                  f.when(
                      (f.col('date') >= '2022-08-11') &
                      (f.col('date') <= '2022-08-16'),
                      f.sum('value').over(w)
                  ))
      )
but it doesn't work and I'm not sure I'm going in the right direction.
Ultimately I'd like to have something like this:
+----------+---+-----+----+------+--------+
| date|cat|value| w1| w2| diff|
+----------+---+-----+----+------+--------+
|2022-08-11| a| 1| 6| 12| 6|
|2022-08-12| a| 1| 6| 12| 6|
|2022-08-13| a| 1| 6| 12| 6|
|2022-08-14| a| 1| 6| 12| 6|
|2022-08-15| a| 1| 6| 12| 6|
|2022-08-16| a| 1| 6| 12| 6|
|2022-08-17| a| 2| 6| 12| 6|
|2022-08-18| a| 2| 6| 12| 6|
|2022-08-19| a| 2| 6| 12| 6|
|2022-08-20| a| 2| 6| 12| 6|
|2022-08-21| a| 2| 6| 12| 6|
|2022-08-22| a| 2| 6| 12| 6|
|2022-08-11| b| 3| 18| 30| 12|
|2022-08-12| b| 3| 18| 30| 12|
|2022-08-13| b| 3| 18| 30| 12|
|2022-08-14| b| 3| 18| 30| 12|
|2022-08-15| b| 3| 18| 30| 12|
|2022-08-16| b| 3| 18| 30| 12|
|2022-08-17| b| 5| 18| 30| 12|
|2022-08-18| b| 5| 18| 30| 12|
|2022-08-19| b| 5| 18| 30| 12|
|2022-08-20| b| 5| 18| 30| 12|
|2022-08-21| b| 5| 18| 30| 12|
|2022-08-22| b| 5| 18| 30| 12|
+----------+---+-----+----+------+--------+
I don't think we need a window in your case; we can just:
from pyspark.sql import functions as func

df_agg = df\
    .withColumn('week',
                func.when((func.col('date') >= '2022-08-17') & (func.col('date') <= '2022-08-22'),
                          func.lit('w1'))
                    .otherwise(func.lit('w2')))\
    .groupby('cat').pivot('week')\
    .agg(func.sum('value'))\
    .withColumn('diff', func.col('w2') - func.col('w1'))
We just create a new column called week to flag which week each date falls in, then build a pivot table from it.
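If you also need those totals repeated on every row, as in your expected output, a small sketch (assuming df and df_agg as defined above) is to join the aggregate back on cat:

df_full = df.join(df_agg, on='cat', how='left')
df_full.show()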
import sys
from pyspark.sql import functions as F
from pyspark.sql.functions import row_number, explode, abs, col
from pyspark.sql.window import Window

w = Window.partitionBy('cat').orderBy('cat')
df1 = (
    # Create a week column to help partition. Use row_number to build a cumulative day count
    # and find every 7th day using Python's modulo
    df.withColumn('wk', (~(row_number().over(w) % 7 > 0)).cast('int'))
    .withColumn('wk', F.sum('wk').over(Window.partitionBy('cat').orderBy().rowsBetween(-sys.maxsize, 0)) + 1)
    # Find the cumulative sum per group per week
    .groupby('cat', 'wk').agg(F.collect_list('date').alias('date'), F.sum('value').alias('value'))
    .withColumn('date', explode('date'))
    # Put the weekly totals in an array in preparation for the pivot
    .withColumn('value_1', F.collect_set('value').over(Window.partitionBy('cat').orderBy('date', 'value').rowsBetween(-sys.maxsize, sys.maxsize)))
    # Pivot and create the week columns
    .withColumn('wk', F.array(F.struct(*[F.col('value_1')[i].alias(f"week_{i+1}") for i in range(2)])))
    .selectExpr('*', 'inline(wk)').drop('wk', 'value_1')
    # Find the difference
    .withColumn('diff', abs(col('week_1') - col('week_2')))
).show()
There are some things about this problem that do not make sense; please see the end of this answer for my observations.
First, the dates from 8/11 to 8/16 do not make up a whole week. Second, the labels of week-1 being 8/17 to 8/22 and week-2 being 8/11 to 8/16 are logically backwards.
I am going to solve this problem using PySpark and Spark SQL since it is straightforward.
#
# Create sample data
#
dat1 = [
("2022-08-11","a",1),
("2022-08-12","a",1),
("2022-08-13","a",1),
("2022-08-14","a",1),
("2022-08-15","a",1),
("2022-08-16","a",1),
("2022-08-17","a",2),
("2022-08-18","a",2),
("2022-08-19","a",2),
("2022-08-20","a",2),
("2022-08-21","a",2),
("2022-08-22","a",2),
("2022-08-11","b",1),
("2022-08-12","b",1),
("2022-08-13","b",1),
("2022-08-14","b",1),
("2022-08-15","b",1),
("2022-08-16","b",1),
("2022-08-17","b",3),
("2022-08-18","b",3),
("2022-08-19","b",3),
("2022-08-20","b",3),
("2022-08-21","b",3),
("2022-08-22","b",3)
]
col1 = ["date", "cat", "value"]
df1 = spark.createDataFrame(data=dat1, schema=col1)
df1.createOrReplaceTempView("sample_data")
The above code creates a temporary view over the sample data set.
#
# Core data - add category w0
#
stmt = """
select
date,
cat,
value,
case
when date >= "2022-08-11" and date <= "2022-08-16" then 2
when date >= "2022-08-17" and date <= "2022-08-22" then 1
else 0
end as w0
from sample_data as q1
"""
df2 = spark.sql(stmt)
df2.createOrReplaceTempView("core_data")
The code above labels each row as week-1 or week-2 and saves this category information as w0. This could have been hard coded into the data set above.
#
# Pivot data - sum value by cat, pivot on w0
#
stmt = """
select * from
(
select cat, w0, value from core_data
)
pivot (
cast(sum(value) as DECIMAL(4, 2)) as total
for w0 in (1 w1, 2 w2)
)
"""
df3 = spark.sql(stmt)
df3.createOrReplaceTempView("pivot_data")
The code above creates a column per week category and summarizes the values.
Please note, your expected result set shows 3 and 5 for cat = b while the original data set has 1 and 3; I am using your original data set.
Last but not least, we join the core_data to the pivot_data and create a calculated column of the difference of (w1-w2).
You can use spark.sql() to create a dataframe and save this result as a file if you want.
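For example, a sketch of that final join (assuming the pivot above produces columns named w1_total and w2_total; check df3.printSchema() for the exact names):

stmt = """
select
  c.date,
  c.cat,
  c.value,
  p.w1_total as w1,
  p.w2_total as w2,
  p.w1_total - p.w2_total as diff
from core_data as c
join pivot_data as p
  on c.cat = p.cat
"""
df4 = spark.sql(stmt)
df4.show()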
To recap: the length of the week categories is not 7 days, labeling the prior week with a greater number than the current one does not make sense, and the expected result set in your example is inconsistent since the input set has different numbers.
In short, working with temporary views allows you to leverage your existing T-SQL skills.

Is there any method by which we can limit the rows in repartition function?

In Spark I am trying to limit the number of rows to 100 in each partition. But I don't want to write it to a file yet; I need to perform more operations on the data before overwriting the records.
You can do it using repartition.
To keep n records in each partition, you need to repartition your data into total_data_count / n partitions.
For example: I have 100 records; if I want each partition to hold 10 records, I have to repartition my data into 10 parts with df.repartition(10).
>>> from pyspark.sql.functions import spark_partition_id, asc
>>> df=spark.read.csv("/path to csv/sample2.csv",header=True)
>>> df.count()
100
>>> df1=df.repartition(10)
>>> df1\
... .withColumn("partitionId", spark_partition_id())\
... .groupBy("partitionId")\
... .count()\
... .orderBy(asc("count"))\
... .show()
+-----------+-----+
|partitionId|count|
+-----------+-----+
| 6| 10|
| 3| 10|
| 5| 10|
| 9| 10|
| 8| 10|
| 4| 10|
| 7| 10|
| 1| 10|
| 0| 10|
| 2| 10|
+-----------+-----+
Here you can see each partition has 10 records.
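If the target is 100 rows per partition and the total row count is not known in advance, a sketch (assuming an extra count() over the data is acceptable) is to derive the partition count from the row count:

import math

rows_per_partition = 100
num_partitions = max(1, math.ceil(df.count() / rows_per_partition))
# repartition distributes rows round-robin, so each partition ends up with roughly 100 rows
df_limited = df.repartition(num_partitions)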

Spark SQL find min value in column and get whole row

I have a table like below and I want to get the row where distance is minimal, in Spark SQL.
I tried this
result.select($"sourceBorder", $"targetBorder", $"min(distance))").show()
which gives an error, and result.agg(min("distance")) only gives the distance column, not the others.
+------------+------------+--------+
|sourceBorder|targetBorder|distance|
+------------+------------+--------+
| 3| 12| 20|
| 4| 12| 28|
| 2| 12| 16|
| 3| 6| 15|
| 4| 6| 19|
| 2| 6| 7|
| 3| 7| 15|
| 4| 7| 23|
| 2| 7| 11|
+------------+------------+--------+
So in the end I want this row:
| 2| 6| 7|
Add a column of minimum distance, and filter the rows where distance = minimum distance:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.min

result.withColumn(
  "mindistance",
  min($"distance").over(Window.orderBy("distance"))
).filter($"distance" === $"mindistance")

Pyspark - add missing values per key?

I have a Pyspark dataframe with some non-unique key key and some columns number and value.
For most keys, the number column goes from 1 to 12, but for some of them, there are gaps in numbers (for ex. we have numbers [1, 2, 5, 9]). I would like to add missing rows, so that for every key we have all the numbers in range 1-12 populated with the last seen value.
So that for table
key number value
a 1 6
a 2 10
a 5 20
a 9 25
I would like to get
key number value
a 1 6
a 2 10
a 3 10
a 4 10
a 5 20
a 6 20
a 7 20
a 8 20
a 9 25
a 10 25
a 11 25
a 12 25
I thought about creating a table of a and an array of 1-12, exploding the array and joining with my original table, then separately populating the value column with previous value using a window function bounded by current row. However, it seems a bit inelegant and I wonder if there is a better way to achieve what I want?
I do not think your proposed approach is inelegant - but you can achieve the same using range instead of explode.
First create a dataframe with all the numbers in your range. You will also want to cross join this with the distinct key column from your DataFrame.
all_numbers = spark.range(1, 13).withColumnRenamed("id", "number")
all_numbers = all_numbers.crossJoin(df.select("key").distinct()).cache()
all_numbers.show()
#+------+---+
#|number|key|
#+------+---+
#| 1| a|
#| 2| a|
#| 3| a|
#| 4| a|
#| 5| a|
#| 6| a|
#| 7| a|
#| 8| a|
#| 9| a|
#| 10| a|
#| 11| a|
#| 12| a|
#+------+---+
Now you can outer join this to your original DataFrame and forward fill using the last known good value. If the number of distinct keys is small enough, you may be able to broadcast the generated DataFrame to speed up the join.
from pyspark.sql.functions import broadcast, last
from pyspark.sql import Window
df.join(broadcast(all_numbers), on=["number", "key"], how="outer")\
    .withColumn(
        "value",
        last(
            "value",
            ignorenulls=True
        ).over(
            Window.partitionBy("key").orderBy("number")
                  .rowsBetween(Window.unboundedPreceding, 0)
        )
    )\
    .show()
#+------+---+-----+
#|number|key|value|
#+------+---+-----+
#| 1| a| 6|
#| 2| a| 10|
#| 3| a| 10|
#| 4| a| 10|
#| 5| a| 20|
#| 6| a| 20|
#| 7| a| 20|
#| 8| a| 20|
#| 9| a| 25|
#| 10| a| 25|
#| 11| a| 25|
#| 12| a| 25|
#+------+---+-----+
You could do this without a join. I have tested this with different gaps and it will always work as long as number 1 is always present in the input (the sequence needs to start from there), and it will always range up to 12. I used a couple of windows to get a column I could use in the sequence, then built a custom sequence using expressions and exploded it to get the desired result. If for some reason you have inputs that do not contain number 1, let me know and I will update my solution.
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import when
w=Window().partitionBy("key").orderBy("number")
w2=Window().partitionBy("key").orderBy("number").rowsBetween(Window.unboundedPreceding,Window.unboundedFollowing)
df.withColumn("number2", F.lag("number").over(w)).withColumn("diff", F.when((F.col("number2").isNotNull()) & ((F.col("number")-F.col("number2")) > 1), (F.col("number")-F.col("number2"))).otherwise(F.lit(0)))\
.withColumn("diff2", F.lead("diff").over(w)).withColumn("diff2", F.when(F.col("diff2").isNull(), F.lit(0)).otherwise(F.col("diff2"))).withColumn("diff2", F.when(F.col("diff2")!=0, F.col("diff2")-1).otherwise(F.col("diff2"))).withColumn("max", F.max("number").over(w2))\
.withColumn("diff2", F.when((F.col("number")==F.col("max")) & (F.col("number")<F.lit(12)), F.lit(12)-F.col("number")).otherwise(F.col("diff2")))\
.withColumn("number2", F.when(F.col("diff2")!=0,F.expr("""sequence(number,number+diff2,1)""")).otherwise(F.expr("""sequence(number,number+diff2,0)""")))\
.drop("diff","diff2","max")\
.withColumn("number2", F.explode("number2")).drop("number")\
.select("key", F.col("number2").alias("number"), "value")\
.show()
+---+------+-----+
|key|number|value|
+---+------+-----+
| a| 1| 6|
| a| 2| 10|
| a| 3| 10|
| a| 4| 10|
| a| 5| 20|
| a| 6| 20|
| a| 7| 20|
| a| 8| 20|
| a| 9| 25|
| a| 10| 25|
| a| 11| 25|
| a| 12| 25|
+---+------+-----+
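As a side note, the same sequence-and-explode idea can be written more compactly. Below is a sketch (assuming, as above, that each key always starts at number 1 and should run through 12) that uses lead to find the next provided number and fills the gap up to it:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

w = Window.partitionBy("key").orderBy("number")
filled = df\
    .withColumn("next_number", F.lead("number").over(w))\
    .withColumn("number",
                F.explode(F.sequence(F.col("number"),
                                     F.coalesce(F.col("next_number") - 1, F.lit(12)))))\
    .drop("next_number")\
    .select("key", "number", "value")
filled.show()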

Spark Dataframe maximum on Several Columns of a Group

How can I get the maximum value for different (string and numerical) types of columns in a DataFrame in Scala using Spark?
Let's say this is my data:
+----+-----+-------+------+
|name|value1|value2|string|
+----+-----+-------+------+
| A| 7| 9| "a"|
| A| 1| 10| null|
| B| 4| 4| "b"|
| B| 3| 6| null|
+----+-----+-------+------+
and the desired outcome is:
+----+-----+-------+------+
|name|value1|value2|string|
+----+-----+-------+------+
| A| 7| 10| "a"|
| B| 4| 6| "b"|
+----+-----+-------+------+
Is there a function like pandas' apply(max, axis=0), or do I have to write a UDF?
What I can do is df.groupBy("name").max("value1"), but I cannot perform two maxes in a row, nor does a Sequence work in the max() function.
Any ideas to solve the problem quickly?
Use this:
df.groupBy("name").agg(max("value1"), max("value2"))
Note that max also works on the string column (lexicographic ordering, with nulls ignored), so you can add max("string") to the same agg to reproduce your desired output.