Get total row count over a window - pyspark

In PySpark, would it be possible to obtain the total number of rows in a particular window?
Right now I am using:
w = Window.partitionBy("column_to_partition_by")
F.count(col("column_1")).over(w)
However, this only gives me the incremental row count. What I need is the total number of rows in that particular window partition. Can anyone tell me the command for this?

I think you need to add rowsBetween with your window clause.
Example:
df.show()
#+---+---+
#| i| j|
#+---+---+
#| 1| a|
#| 1| b|
#| 1| c|
#| 2| c|
#+---+---+
w = Window.partitionBy("i").rowsBetween(-sys.maxsize,sys.maxsize)
df.withColumn("count",count(col("j")).over(w)).show()
#+---+---+-----+
#| i| j|count|
#+---+---+-----+
#| 1| a| 3|
#| 1| b| 3|
#| 1| c| 3|
#| 2| c| 1|
#+---+---+-----+
Usually when we have .orderBy clause to window then we need to have rowsBetween needs to be added, as orderby clause defaults to unboundedPreceeding and currentRow.
w = Window.partitionBy("i").orderBy("j")
df.withColumn("count",count(col("j")).over(w)).show()
#incremental count
#+---+---+-----+
#| i| j|count|
#+---+---+-----+
#| 1| a| 1|
#| 1| b| 2|
#| 1| c| 3|
#| 2| c| 1|
#+---+---+-----+
w = Window.partitionBy("i").orderBy("j").rowsBetween(-sys.maxsize,sys.maxsize)
df.withColumn("count",count(col("j")).over(w)).show()
#total number of rows count
#+---+---+-----+
#| i| j|count|
#+---+---+-----+
#| 1| a| 3|
#| 1| b| 3|
#| 1| c| 3|
#| 2| c| 1|
#+---+---+-----+

Related

Compare sum of values between two specific date ranges over different categories

I'm working in databricks. I have the following dataframe:
+----------+---+-----+
| date|cat|value|
+----------+---+-----+
|2022-08-11| a| 1|
|2022-08-12| a| 1|
|2022-08-13| a| 1|
|2022-08-14| a| 1|
|2022-08-15| a| 1|
|2022-08-16| a| 1|
|2022-08-17| a| 2|
|2022-08-18| a| 2|
|2022-08-19| a| 2|
|2022-08-20| a| 2|
|2022-08-21| a| 2|
|2022-08-22| a| 2|
|2022-08-11| b| 1|
|2022-08-12| b| 1|
|2022-08-13| b| 1|
|2022-08-14| b| 1|
|2022-08-15| b| 1|
|2022-08-16| b| 1|
|2022-08-17| b| 3|
|2022-08-18| b| 3|
|2022-08-19| b| 3|
|2022-08-20| b| 3|
|2022-08-21| b| 3|
|2022-08-22| b| 3|
+----------+---+-----+
I want to be able to compare the sum of the values between the 17 and the 22 (week1) and between the 11 and the 16 (week2). Start end and end date of each period are predefined.
So far I've tried something like this:
w = (Window.partitionBy('cat'))
df = (df
.withColumn('date', f.to_date('date', 'yyyy-MM-dd'))
.withColumn('value_week_1',
f.when(
(f.col('date') >= '2022-08-17') &
(f.col('date') <= '2022-08-22'),
f.sum('value').over(w)
)
)
.withColumn('value_week_2',
f.when(
(f.col('date') >= '2022-08-11') &
(f.col('date') <= '2022-08-16'),
f.sum('value').over(w)
)
)
)
but It doesn't work and I'm not sure I'm going in the right direction.
Ultimately I'd like to have something like this:
+----------+---+-----+----+------+--------+
| date|cat|value| w1| w2| diff|
+----------+---+-----+----+------+--------+
|2022-08-11| a| 1| 6| 12| 6|
|2022-08-12| a| 1| 6| 12| 6|
|2022-08-13| a| 1| 6| 12| 6|
|2022-08-14| a| 1| 6| 12| 6|
|2022-08-15| a| 1| 6| 12| 6|
|2022-08-16| a| 1| 6| 12| 6|
|2022-08-17| a| 2| 6| 12| 6|
|2022-08-18| a| 2| 6| 12| 6|
|2022-08-19| a| 2| 6| 12| 6|
|2022-08-20| a| 2| 6| 12| 6|
|2022-08-21| a| 2| 6| 12| 6|
|2022-08-22| a| 2| 6| 12| 6|
|2022-08-11| b| 3| 18| 30| 12|
|2022-08-12| b| 3| 18| 30| 12|
|2022-08-13| b| 3| 18| 30| 12|
|2022-08-14| b| 3| 18| 30| 12|
|2022-08-15| b| 3| 18| 30| 12|
|2022-08-16| b| 3| 18| 30| 12|
|2022-08-17| b| 5| 18| 30| 12|
|2022-08-18| b| 5| 18| 30| 12|
|2022-08-19| b| 5| 18| 30| 12|
|2022-08-20| b| 5| 18| 30| 12|
|2022-08-21| b| 5| 18| 30| 12|
|2022-08-22| b| 5| 18| 30| 12|
+----------+---+-----+----+------+--------+
I think we don't need to use window in your case, we can just:
df_agg = df\
.withColumn('week', func.when((func.col('date')>='2022-08-17')&(func.col('date')<='2022-08-22'), func.lit('w1')).otherwise(func.lit('w2')))\
.groupby('cat').pivot('week')\
.agg(func.sum('value'))\
.withColumn('diff', func.col('w2')-func.col('w1'))
We can just create a new column called week to see if the date is under which week, then create a pivot table.
w =Window.partitionBy('cat').orderBy('cat')
df1 = (
#Create week column to help partion. Use row number to create cululative day count. Find each 7th day using pytho's modulo
df.withColumn('wk',(~(row_number().over(w)%7>0)).cast('int')).withColumn('wk',F.sum('wk').over(Window.partitionBy('cat').orderBy().rowsBetween(-sys.maxsize, 0))+1)
#Finfd the cumulative sum per group per week
.groupby('cat','wk').agg(F.collect_list('date').alias('date'),F.sum('value').alias('value')).withColumn('date', explode('date'))
# #Put the total sum in an array in preparation for pivot
.withColumn('value_1', F.collect_set('value').over(Window.partitionBy('cat').orderBy('date','value').rowsBetween(-sys.maxsize, sys.maxsize)))
# #pivot and create week columns
.withColumn('wk',F.array(F.struct(*[F.col('value_1')[i].alias(f"week_{i+1}")for i in range(2)]))).selectExpr('*','inline(wk)').drop('wk','value_1')
# #Find the difference
.withColumn('diff', abs(col('week_1')-col('week_2')))
).show()
There are somethings about this problem that do not make sense. Please see end of article for my observations.
First, the dates from 8/11 to 8/16 do not make up a whole week. Second, the labels of week-1 being 8/17 to 8/22 and week-2 being 8/11 to 8/16 are logically backwards.
I am going to solve this problem using PySpark and Spark SQL since it is straight forward.
#
# Create sample data
#
dat1 = [
("2022-08-11","a",1),
("2022-08-12","a",1),
("2022-08-13","a",1),
("2022-08-14","a",1),
("2022-08-15","a",1),
("2022-08-16","a",1),
("2022-08-17","a",2),
("2022-08-18","a",2),
("2022-08-19","a",2),
("2022-08-20","a",2),
("2022-08-21","a",2),
("2022-08-22","a",2),
("2022-08-11","b",1),
("2022-08-12","b",1),
("2022-08-13","b",1),
("2022-08-14","b",1),
("2022-08-15","b",1),
("2022-08-16","b",1),
("2022-08-17","b",3),
("2022-08-18","b",3),
("2022-08-19","b",3),
("2022-08-20","b",3),
("2022-08-21","b",3),
("2022-08-22","b",3)
]
col1 = ["date", "cat", "value"]
df1 = spark.createDataFrame(data=dat1, schema=col1)
df1.createOrReplaceTempView("sample_data")
The above code create a temporary view with the data set.
#
# Core data - add category w0
#
stmt = """
select
date,
cat,
value,
case
when date >= "2022-08-11" and date <= "2022-08-16" then 2
when date >= "2022-08-17" and date <= "2022-08-22" then 1
else 0
end as w0
from sample_data as q1
"""
df2 = spark.sql(stmt)
df2.createOrReplaceTempView("core_data")
The code above labels the data as week-1 or week-2 and save this category information as w0. This could have been hard coded into the dataset above.
#
# Pivot data - sum vaule by cat, pivot on w0
#
stmt = """
select * from
(
select cat, w0, value from core_data
)
pivot (
cast(sum(value) as DECIMAL(4, 2)) as total
for w0 in (1 w1, 2 w2)
)
"""
df3 = spark.sql(stmt)
df3.createOrReplaceTempView("pivot_data")
The code above creates a column per week category and summarizes the values.
Please note, the result set has 3/5 for cat = b while the original data set has 1/3. I am using your original data set.
Last but not least, we join the core_data to the pivot_data and create a calculated column of the difference of (w1-w2).
You can use spark.sql() to create a dataframe and save this result as a file if you want.
To recap, the length of the week categories is not 7 days, labeling a prior week a greater number than the current does not make sense, and the expected result set is wrong in your example since the input set has different numbers.
In short, working with temporary views allows you to leverage your existing T-SQL skills.

create a new column to increment value when value resets to 1 in another column in pyspark

Logic and columnIn Pyspark DataFrame consider a column like [1,2,3,4,1,2,1,1,2,3,1,2,1,1,2]. Pyspark Column
create a new column to increment value when value resets to 1.
Expected output is[1,1,1,1,2,2,3,4,4,4,5,5,6,7,7]
i am bit new to pyspark, if anyone can help me it would be great for me.
written the logic as like below
def sequence(row_num):
results = [1, ]
flag = 1
for col in range(0, len(row_num)-1):
if row_num[col][0]>=row_num[col+1][0]:
flag+=1
results.append(flag)
return results
but not able to pass a column through udf. please help me in this
Your Dataframe:
df = spark.createDataFrame(
[
('1','a'),
('2','b'),
('3','c'),
('4','d'),
('1','e'),
('2','f'),
('1','g'),
('1','h'),
('2','i'),
('3','j'),
('1','k'),
('2','l'),
('1','m'),
('1','n'),
('2','o')
], ['group','label']
)
+-----+-----+
|group|label|
+-----+-----+
| 1| a|
| 2| b|
| 3| c|
| 4| d|
| 1| e|
| 2| f|
| 1| g|
| 1| h|
| 2| i|
| 3| j|
| 1| k|
| 2| l|
| 1| m|
| 1| n|
| 2| o|
+-----+-----+
You can create a flag and use a Window Function to calculate the cumulative sum. No need to use an UDF:
from pyspark.sql import Window as W
from pyspark.sql import functions as F
w = W.partitionBy().orderBy('label').rowsBetween(Window.unboundedPreceding, 0)
df\
.withColumn('Flag', F.when(F.col('group') == 1, 1).otherwise(0))\
.withColumn('Output', F.sum('Flag').over(w))\
.show()
+-----+-----+----+------+
|group|label|Flag|Output|
+-----+-----+----+------+
| 1| a| 1| 1|
| 2| b| 0| 1|
| 3| c| 0| 1|
| 4| d| 0| 1|
| 1| e| 1| 2|
| 2| f| 0| 2|
| 1| g| 1| 3|
| 1| h| 1| 4|
| 2| i| 0| 4|
| 3| j| 0| 4|
| 1| k| 1| 5|
| 2| l| 0| 5|
| 1| m| 1| 6|
| 1| n| 1| 7|
| 2| o| 0| 7|
+-----+-----+----+------+

pyspark: Auto filling in implicit missing values

I have a dataframe
user day amount
a 2 10
a 1 14
a 4 5
b 1 4
You see that, the maximum value of day is 4, and the minimum value is 1. I want to fill 0 for amount column in all missing days of all users, so the above data frame will become.
user day amount
a 2 10
a 1 14
a 4 5
a 3 0
b 1 4
b 2 0
b 3 0
b 4 0
How could I do that in PySpark? Many thanks.
Here is one approach. You can get the min and max values first , then group on user column and pivot, then fill in missing columns and fill all nulls as 0, then stack them back:
min_max = df.agg(F.min("day"),F.max("day")).collect()[0]
df1 = df.groupBy("user").pivot("day").agg(F.first("amount").alias("amount")).na.fill(0)
missing_cols = [F.lit(0).alias(str(i)) for i in range(min_max[0],min_max[1]+1)
if str(i) not in df1.columns ]
df1 = df1.select("*",*missing_cols)
#+----+---+---+---+---+
#|user| 1| 2| 4| 3|
#+----+---+---+---+---+
#| b| 4| 0| 0| 0|
#| a| 14| 10| 5| 0|
#+----+---+---+---+---+
#the next step is inspired from https://stackoverflow.com/a/37865645/9840637
arr = F.explode(F.array([F.struct(F.lit(c).alias("day"), F.col(c).alias("amount"))
for c in df1.columns[1:]])).alias("kvs")
(df1.select(["user"] + [arr])
.select(["user"]+ ["kvs.day", "kvs.amount"]).orderBy("user")).show()
+----+---+------+
|user|day|amount|
+----+---+------+
| a| 1| 14|
| a| 2| 10|
| a| 4| 5|
| a| 3| 0|
| b| 1| 4|
| b| 2| 0|
| b| 4| 0|
| b| 3| 0|
+----+---+------+
Note, since column day was pivotted , the dtype might have changed so you may have to cast them back to the original dtype
Another way to do this is to use sequence, array functions and explode. (spark2.4+)
from pyspark.sql import functions as F
from pyspark.sql.window import Window
w=Window().partitionBy(F.lit(0))
df.withColumn("boundaries", F.sequence(F.min("day").over(w),F.max("day").over(w),F.lit(1)))\
.groupBy("user").agg(F.collect_list("day").alias('day'),F.collect_list("amount").alias('amount')\
,F.first("boundaries").alias("boundaries")).withColumn("boundaries", F.array_except("boundaries","day"))\
.withColumn("day",F.flatten(F.array("day","boundaries"))).drop("boundaries")\
.withColumn("zip", F.explode(F.arrays_zip("day","amount")))\
.select("user","zip.day", F.when(F.col("zip.amount").isNull(),\
F.lit(0)).otherwise(F.col("zip.amount")).alias("amount")).show()
#+----+---+------+
#|user|day|amount|
#+----+---+------+
#| a| 2| 10|
#| a| 1| 14|
#| a| 4| 5|
#| a| 3| 0|
#| b| 1| 4|
#| b| 2| 0|
#| b| 3| 0|
#| b| 4| 0|
#+----+---+------+

using spark sql for conditional lag summation

Below is the dataframe i have
df = sqlContext.createDataFrame(
[("0", "0"), ("1", "2"), ("2", "3"), ("3", "4"), ("4", "0"), ("5", "5"), ("6", "5")],
["id", "value"])
+---+-----+
| id|value|
+---+-----+
| 0| 0|
| 1| 2|
| 2| 3|
| 3| 4|
| 4| 0|
| 5| 5|
| 6| 5|
+---+-----+
And what I want to get is :
+---+-----+---+-----+
| id|value|masterid|partsum|
+---+-----|---+-----+
| 0| 0| 0| 0|
| 1| 2| 0| 2|
| 2| 3| 0| 5|
| 3| 4| 0| 9|
| 4| 0| 4| 0|
| 5| 5| 4| 5|
| 6| 5| 4| 10|
+---+-----+---+-----+
So I try to use SparkSQL to do so:
df=df.withColumn("masterid", F.when( df.value !=0 , F.lag(df.id)).otherwise(df.id))
I original thought the lag function can help me process before next iteration so as to get the masterid col. Unfortunately, after i check the manual , it cant help.
So , i would like to ask if there are any special functions i could use to do what i want? Or is there any "conditional lag" function i could use? so that, when i see non-zero item, i can use lag until find a zero number?
IIUC, you can try defining a sub-group label (g in the below code) and two Window Specs:
from pyspark.sql import Window, functions as F
w1 = Window.orderBy('id')
w2 = Window.partitionBy('g').orderBy('id')
df.withColumn('g', F.sum(F.expr('if(value=0,1,0)')).over(w1)).select(
'id'
, 'value'
, F.first('id').over(w2).alias('masterid')
, F.sum('value').over(w2).alias('partsum')
).show()
#+---+-----+--------+-------+
#| id|value|masterid|partsum|
#+---+-----+--------+-------+
#| 0| 0| 0| 0.0|
#| 1| 2| 0| 2.0|
#| 2| 3| 0| 5.0|
#| 3| 4| 0| 9.0|
#| 4| 0| 4| 0.0|
#| 5| 5| 4| 5.0|
#| 6| 5| 4| 10.0|
#+---+-----+--------+-------+

Determining Number of Joint Sessions Per Product Pair

I have this data-frame:
from pyspark.mllib.linalg.distributed import IndexedRow
rows = sc.parallelize([[1, "A"], [1, 'B'] , [1, "A"], [2, 'A'], [2, 'C'] ,[3,'A'], [3, 'B']])
rows_df = rows.toDF(["session_id", "product"])
rows_df.show()
+----------+-------+
|session_id|product|
+----------+-------+
| 1| A|
| 1| B|
| 1| A|
| 2| A|
| 2| C|
| 3| A|
| 3| B|
+----------+-------+
I want to know how many joint sessions each product pair have together. The same products can be in a session multiple times, but I only want one count per session per product pair.
Sample Output:
+---------+---------+-----------------+
|product_a|product_b|num_join_sessions|
+---------+---------+-----------------+
| A| B| 2|
| A| C| 1|
| B| A| 2|
| B| C| 0|
| C| A| 1|
| C| B| 0|
+---------+---------+-----------------+
I'm lost on how to implement this in pyspark.
Getting the joint session count for pairs that have joint sessions is fairly easy. You can achieve this by joining the DataFrame to itself on session_id and filtering out the rows where the products are the same.
Then you group by the product pairs and count the distinct session_ids.
import pyspark.sql.functions as f
rows_df.alias("l").join(rows_df.alias("r"), on="session_id", how="inner")\
.where("l.product != r.product")\
.groupBy(f.col("l.product").alias("product_a"), f.col("r.product").alias("product_b"))\
.agg(f.countDistinct("session_id").alias("num_join_sessions"))\
.show()
#+---------+---------+-----------------+
#|product_a|product_b|num_join_sessions|
#+---------+---------+-----------------+
#| A| C| 1|
#| C| A| 1|
#| B| A| 2|
#| A| B| 2|
#+---------+---------+-----------------+
(Side note: if want ONLY unique pairs of products, change the != to < in the where function).
The tricky part is that you also want the pairs that don't have joint sessions. This can be done, but it won't be efficient because you will need to get a Cartesian product of every product pairing.
Nevertheless, here is one approach:
Start with the above and RIGHT join in the Cartesian product of the distinct products pairs.
rows_df.alias("l").join(rows_df.alias("r"), on="session_id", how="inner")\
.where("l.product != r.product")\
.groupBy(f.col("l.product").alias("product_a"), f.col("r.product").alias("product_b"))\
.agg(f.countDistinct("session_id").alias("num_join_sessions"))\
.join(
rows_df.selectExpr("product AS product_a").distinct().crossJoin(
rows_df.selectExpr("product AS product_b").distinct()
).where("product_a != product_b").alias("pairs"),
on=["product_a", "product_b"],
how="right"
)\
.fillna(0)\
.sort("product_a", "product_b")\
.show()
#+---------+---------+-----------------+
#|product_a|product_b|num_join_sessions|
#+---------+---------+-----------------+
#| A| B| 2|
#| A| C| 1|
#| B| A| 2|
#| B| C| 0|
#| C| A| 1|
#| C| B| 0|
#+---------+---------+-----------------+
Note: the sort is not needed, but I included it to match the order of the desired output.
I believe this should do it:
import pyspark.sql.functions as F
joint_sessions = rows_df.withColumnRenamed(
'product', 'product_a'
).join(
rows_df.withColumnRenamed('product', 'product_b'),
on='session_id',
how='inner'
).filter(
F.col('product_a') != F.col('product_b')
).groupBy(
'product_a',
'product_b'
).agg(
F.countDistinct('session_id').alias('num_join_sessions')
).select(
'product_a',
'product_b',
'num_join_sessions'
)
joint_sessions.show()