How to find coefficient of variation (CV) in pyspark?

I have two PySpark DataFrames and I want to find the coefficient of variation across them, per hour.
dataframe1 :
hours total
00 75969.0
01 75302.0
02 74636.0
03 73969.0
04 73302.0
05 72635.0
dataframe2 :-
hours total1
00 71535
01 71182
02 77628
03 75984
04 75276
05 67259
And I want the output to look like this:
dataframe3 :-
hours total total1 CV
00 75969.0 71535 3.006020
01 75302.0 71182 2.812594
02 74636.0 77628 1.965008
03 73969.0 75984 1.343754
04 73302.0 75276 1.328595
05 72635.0 67259 3.842910
I have done this by converting the PySpark DataFrames to pandas, but I want to calculate the CV without going through pandas. My pandas version looks like this:
pd1 = dataframe1.toPandas()
pd2 = dataframe2.toPandas()
a4 = []
list1 = []
count = len(pd1)
print(count)
import numpy as np
for i in range(count):
    del a4[:]
    p9 = pd1.total[i]
    p10 = pd2.total1[i]
    a4.append(p10)
    a4.append(p9)
    standard_d1 = np.std(a4, ddof=0)
    mean1 = np.mean(a4)
    cv = (standard_d1 / mean1) * 100
    list1.append(cv)
pd1['cv'] = list1

There are several ways to solve this:
Use window
Use udf
Use window + udf
First, let's build the DataFrame:
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import mean, pandas_udf, PandasUDFType
a1 = [(0, 75969.0), (1, 75302.0), (2, 74636.0), (3, 73969.0), (4, 73302.0), (5, 72635.0)]
a2 = [(0, 71535.0), (1, 71182.0), (2, 77628.0), (3, 75984.0), (4, 75276.0), (5, 67259.0)]
df1 = spark.createDataFrame(a1, ['hours', 'total'])
df2 = spark.createDataFrame(a2, ['hours', 'total'])
df = df1.union(df2)
df.show()
+-----+-------+
|hours| total|
+-----+-------+
| 0|75969.0|
| 1|75302.0|
| 2|74636.0|
| 3|73969.0|
| 4|73302.0|
| 5|72635.0|
| 0|71535.0|
| 1|71182.0|
| 2|77628.0|
| 3|75984.0|
| 4|75276.0|
| 5|67259.0|
+-----+-------+
Using udf only
import numpy as np

@pandas_udf(FloatType(), PandasUDFType.GROUPED_AGG)
def _udf(v):
    return 100.0 * np.std(v, ddof=0) / np.mean(v)

df = df.groupBy('hours').agg(_udf(df['total']).alias('CV')).orderBy('hours')
df.show()
Use window
w = Window.partitionBy('hours')
df = df.withColumn('std', F.stddev_pop('total').over(w))
df = df.withColumn('mean', F.mean('total').over(w))
df = df.withColumn('CV', 100.0*df['std']/df['mean']).dropDuplicates(['hours']).drop(*['total', 'std', 'mean']).orderBy('hours')
df.show()
Use window + udf
w = Window.partitionBy('hours')
df = df.withColumn('CV', _udf('total').over(w)).drop('total').dropDuplicates(['hours']).orderBy('hours')
df.show()
All the above methods give you the results:
+-----+---------+
|hours| CV|
+-----+---------+
| 0| 3.00602|
| 1| 2.812594|
| 2|1.9650081|
| 3|1.3437544|
| 4| 1.328595|
| 5|3.8429096|
+-----+---------+

Here, since each hour has only two values, the population standard deviation is std = |total - total1| / 2. Since mean = (total + total1) / 2, we get CV = 100 * |total - total1| / (total + total1).
Therefore :
from pyspark.sql import functions as F
df = dataframe1.join(dataframe2, 'hours', 'inner')
df_final = df.withColumn('CV', F.lit(100) * F.abs(df['total'] - df['total1']) / (df['total'] + df['total1']))
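As a quick sanity check for hour 00, using the values from the expected output above: std = |75969 - 71535| / 2 = 2217, mean = (75969 + 71535) / 2 = 73752, so CV = 100 * 2217 / 73752 = 3.006 (approximately), which matches the CV column in dataframe3.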

Related

Date format in pyspark

My data frame looks like -
id date
1 2018-08-23 11:48:22
2 2019-05-03 06:22:01
3 2019-05-13 10:12:15
4 2019-01-22 16:13:29
5 2018-11-27 11:17:19
My expected output is -
id date date1
1 2018-08-23 11:48:22 2018-08
2 2019-05-03 06:22:01 2019-05
3 2019-05-13 10:12:15 2019-05
4 2019-01-22 16:13:29 2019-01
5 2018-11-27 11:17:19 2018-11
How to do it in pyspark?
I think you are trying to drop the day and time details; you can use the date_format function for that.
>>> df.show()
+---+-------------------+
| id| date|
+---+-------------------+
| 1|2018-08-23 11:48:22|
| 2|2019-05-03 06:22:01|
| 3|2019-05-13 10:12:15|
| 4|2019-01-22 16:13:29|
| 5|2018-11-27 11:17:19|
+---+-------------------+
>>> import pyspark.sql.functions as F
>>>
>>> df.withColumn('date1',F.date_format(F.to_date('date','yyyy-MM-dd HH:mm:ss'),'yyyy-MM')).show()
+---+-------------------+-------+
| id| date| date1|
+---+-------------------+-------+
| 1|2018-08-23 11:48:22|2018-08|
| 2|2019-05-03 06:22:01|2019-05|
| 3|2019-05-13 10:12:15|2019-05|
| 4|2019-01-22 16:13:29|2019-01|
| 5|2018-11-27 11:17:19|2018-11|
+---+-------------------+-------+
Another option is to_date followed by substring. For example:
import pyspark.sql.functions as F
import pyspark.sql.types as T
rawData = [(1, "2018-08-23 11:48:22"),
(2, "2019-05-03 06:22:01"),
(3, "2019-05-13 10:12:15")]
df = spark.createDataFrame(rawData).toDF("id","my_date")
df.withColumn("new_my_date",\
F.substring(F.to_date(F.col("my_date")), 1,7))\
.show()
+---+-------------------+-----------+
| id| my_date|new_my_date|
+---+-------------------+-----------+
| 1|2018-08-23 11:48:22| 2018-08|
| 2|2019-05-03 06:22:01| 2019-05|
| 3|2019-05-13 10:12:15| 2019-05|
+---+-------------------+-----------+
import pyspark.sql.functions as F
split_col = F.split(df['date'], '-')
df = df.withColumn('year', split_col.getItem(0)).withColumn('month', split_col.getItem(1))
df = df.select(F.concat(df['year'], F.lit('-'),df['month']).alias('year_month'))
df.show()
+----------+
|year_month|
+----------+
| 2018-08|
| 2019-05|
| 2019-05|
| 2019-01|
| 2018-11|
+----------+
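Since the date column here is a plain 'yyyy-MM-dd HH:mm:ss' string, a simpler variant (my addition, not from the original answers) is to take the first seven characters directly, which also keeps the id column:
import pyspark.sql.functions as F
df.withColumn('date1', F.substring('date', 1, 7)).show()
This avoids parsing the string into a date at all.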

regular expression pyspark dataframe column

I have a PySpark DataFrame and I want to split column A into A1 and A2 using a regex, as shown below, but my attempt didn't work.
A | A1 | A2
20-13-2012-monday 20-13-2012 monday
20-14-2012-tues 20-14-2012 tues
20-13-2012-wed 20-13-2012 wed
My code looks like this
import re
from pyspark.sql.functions import regexp_extract
reg = r'^([\d]+-[\d]+-[\d]+)'
df=df.withColumn("A1",re.match(reg, df.select(['A'])).group())
df.show()
You can use the regex in a udf and achieve the required output like this:
>>> import re
>>> from pyspark.sql.types import *
>>> from pyspark.sql.functions import udf
>>> def get_date_day(a):
... x, y = re.split('^([\d]+-[\d]+-[\d]+)', a)[1:]
... return [x, y[1:]]
>>> get_date_day('20-13-2012-monday')
['20-13-2012', 'monday']
>>> get_date_udf = udf(get_date_day, ArrayType(StringType()))
>>> df = sc.parallelize([('20-13-2012-monday',), ('20-14-2012-tues',), ('20-13-2012-wed',)]).toDF(['A'])
>>> df.show()
+-----------------+
| A|
+-----------------+
|20-13-2012-monday|
| 20-14-2012-tues|
| 20-13-2012-wed|
+-----------------+
>>> df = df.withColumn("A12", get_date_udf('A'))
>>> df.show(truncate=False)
+-----------------+--------------------+
|A |A12 |
+-----------------+--------------------+
|20-13-2012-monday|[20-13-2012, monday]|
|20-14-2012-tues |[20-14-2012, tues] |
|20-13-2012-wed |[20-13-2012, wed] |
+-----------------+--------------------+
>>> df = df.withColumn("A1", udf(lambda x:x[0])('A12')).withColumn("A2", udf(lambda x:x[1])('A12'))
>>> df = df.drop('A12')
>>> df.show(truncate=False)
+-----------------+----------+------+
|A |A1 |A2 |
+-----------------+----------+------+
|20-13-2012-monday|20-13-2012|monday|
|20-14-2012-tues |20-14-2012|tues |
|20-13-2012-wed |20-13-2012|wed |
+-----------------+----------+------+
Hope this helps!
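A possible alternative, not part of the original answer, is to skip the Python udf entirely and use regexp_extract (which the question already imports); a minimal sketch:
from pyspark.sql.functions import regexp_extract

pattern = r'^(\d+-\d+-\d+)-(.*)$'   # group 1: date part, group 2: day name
df = df.withColumn("A1", regexp_extract("A", pattern, 1)) \
       .withColumn("A2", regexp_extract("A", pattern, 2))
df.show(truncate=False)
Because this stays in the JVM, it avoids the serialization overhead of a Python udf.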

Comparing Previous data with Current data in spark scala

I want to compare previous data with current data, month by month. I have data like below.
Data-set 1 : (Prev) Data-set 2 : (Latest)
Year-month Sum-count Year-Month Sum-count
-- -- 201808 48
201807 30 201807 22
201806 20 201806 20
201805 35 201805 20
201804 12 201804 9
201803 15 -- --
I have data sets as shown above. I want to compare the two data sets on the year-month column and the sum-count, and find the difference as a percentage.
I am using spark 2.3.0 and Scala 2.11.
Here is my code:
import org.apache.spark.sql.functions.lag
val mdf = spark.read.format("csv").
option("InferSchema","true").
option("header","true").
option("delimiter",",").
option("charset","utf-8").
load("c:\\test.csv")
mdf.createOrReplaceTempView("test")
val res= spark.sql("select year-month,SUM(Sum-count) as SUM_AMT from test d group by year-month")
val win = org.apache.spark.sql.expressions.Window.orderBy("data_ym")
val res1 = res.withColumn("Prev_month", lag("SUM_AMT", 1,0).over(win)).withColumn("percentage",col("Prev_month") / sum("SUM_AMT").over()).show()
I need output like this :
If the percentage is more than 10%, then I need to set the flag to F.
set1 cnt set2 cnt output(Percentage) Flag
201807 30 201807 22 7% T
201806 20 201806 20 0% T
201805 35 201805 20 57% F
Please help me on this.
It can be done this way:
import spark.implicits._
import org.apache.spark.sql.functions.{abs, when}

val data1 = List(
("201807", 30),
("201806", 20),
("201805", 35),
("201804", 12),
("201803", 15)
)
val data2 = List(
("201808", 48),
("201807", 22),
("201806", 20),
("201805", 20),
("201804", 9)
)
val df1 = data1.toDF("Year-month", "Sum-count")
val df2 = data2.toDF("Year-month", "Sum-count")
val joined = df1.alias("df1").join(df2.alias("df2"), "Year-month")
joined
.withColumn("output(Percentage)", abs($"df1.Sum-count" - $"df2.Sum-count").divide($"df1.Sum-count"))
.withColumn("Flag", when($"output(Percentage)" > 0.1, "F").otherwise("T"))
.show(false)
Output:
+----------+---------+---------+-------------------+----+
|Year-month|Sum-count|Sum-count|output(Percentage) |Flag|
+----------+---------+---------+-------------------+----+
|201807 |30 |22 |0.26666666666666666|F |
|201806 |20 |20 |0.0 |T |
|201805 |35 |20 |0.42857142857142855|F |
|201804 |12 |9 |0.25 |F |
+----------+---------+---------+-------------------+----+
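For readers coming from the PySpark questions above, here is a rough PySpark sketch of the same join-and-compare idea (my addition, not part of either original answer; the renamed Sum-count1 column is just to keep the joined columns distinct):
from pyspark.sql import functions as F

df1 = spark.createDataFrame([("201807", 30), ("201806", 20), ("201805", 35),
                             ("201804", 12), ("201803", 15)], ["Year-month", "Sum-count"])
df2 = spark.createDataFrame([("201808", 48), ("201807", 22), ("201806", 20),
                             ("201805", 20), ("201804", 9)], ["Year-month", "Sum-count1"])

joined = df1.join(df2, "Year-month")
result = (joined
          .withColumn("pct_diff",
                      F.abs(joined["Sum-count"] - joined["Sum-count1"]) / joined["Sum-count"])
          .withColumn("Flag", F.when(F.col("pct_diff") > 0.1, "F").otherwise("T")))
result.orderBy("Year-month").show(truncate=False)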
Here's my solution :
val values1 = List(List("1201807", "30")
,List("1201806", "20") ,
List("1201805", "35"),
List("1201804","12"),
List("1201803","15")
).map(x =>(x(0), x(1)))
val values2 = List(List("201808", "48")
,List("1201807", "22") ,
List("1201806", "20"),
List("1201805","20"),
List("1201804","9")
).map(x =>(x(0), x(1)))
import spark.implicits._
import org.apache.spark.sql.functions._
val df1 = values1.toDF
val df2 = values2.toDF
df1.join(df2, Seq("_1"), "full").toDF("set", "cnt1", "cnt2")
.withColumn("percentage1", col("cnt1")/sum("cnt1").over() * 100)
.withColumn("percentage2", col("cnt2")/sum("cnt2").over() * 100)
.withColumn("percentage", abs(col("percentage2") - col("percentage1")))
.withColumn("flag", when(col("percentage") > 10, "F").otherwise("T")).na.drop().show()
Here's the result :
+-------+----+----+------------------+------------------+------------------+----+
| set|cnt1|cnt2| percentage1| percentage2| percentage|flag|
+-------+----+----+------------------+------------------+------------------+----+
|1201804| 12| 9|10.714285714285714| 7.563025210084033| 3.15126050420168| T|
|1201807| 30| 22|26.785714285714285|18.487394957983195| 8.29831932773109| T|
|1201806| 20| 20|17.857142857142858| 16.80672268907563|1.0504201680672267| T|
|1201805| 35| 20| 31.25| 16.80672268907563|14.443277310924369| F|
+-------+----+----+------------------+------------------+------------------+----+
I hope it helps :)

How to use a window function to count day of week occurrences in Pyspark 2.1

With the below PySpark (2.1) dataset, how do you use a window function to count the number of times the current record's day of week appeared in the last 28 days?
Example Data frame:
from pyspark.sql import functions as F
df = sqlContext.createDataFrame([
("a", "1", "2018-01-01 12:01:01","Monday"),
("a", "13", "2018-01-01 14:01:01","Monday"),
("a", "22", "2018-01-02 22:01:01","Tuesday"),
("a", "43", "2018-01-08 01:01:01","Monday"),
("a", "43", "2018-01-09 01:01:01","Tuesday"),
("a", "74", "2018-01-10 12:01:01","Wednesday"),
("a", "95", "2018-01-15 06:01:01","Monday"),
], ["person_id", "other_id", "timestamp","dow"])
df.withColumn("dow_count",`some window function`)
Possible window
from pyspark.sql import Window
from pyspark.sql import functions as F
Days_28 = (86400 * 28)
window = Window.partitionBy("person_id").orderBy('timestamp').rangeBetween(-Days_28, -1)
## I know this next line is wrong
df.withColumn("dow_count",F.sum(F.when(Current_day=windowed_day,1).otherwise(0)).over(window))
Example Output
df.show()
+---------+--------+-------------------+---------+---------+
|person_id|other_id| timestamp| dow|dow_count|
+---------+--------+-------------------+---------+---------+
| a| 1|2018-01-01 12:01:01| Monday|0 |
| a| 13|2018-01-01 14:01:01| Monday|1 |
| a| 22|2018-01-02 22:01:01| Tuesday|0 |
| a| 43|2018-01-08 01:01:01| Monday|2 |
| a| 43|2018-01-09 01:01:01| Tuesday|1 |
| a| 74|2018-01-10 12:01:01|Wednesday|0 |
| a| 95|2018-01-15 06:01:01| Monday|3 |
+---------+--------+-------------------+---------+---------+
Use F.row_number() with a window partitioned by (person_id, dow); the logic with your rangeBetween() should be replaced with a where():
from datetime import timedelta, datetime
N_days = 28
end = datetime.combine(datetime.today(), datetime.min.time())
start = end - timedelta(days=N_days)
window = Window.partitionBy("person_id", "dow").orderBy('timestamp')
df.where((df.timestamp < end) & (df.timestamp >= start)) \
.withColumn('dow_count', F.row_number().over(window)-1) \
.show()
I figured it out and thought I'd share.
First create a unix timestamp and cast it to long.
Then, partition by person and day of week.
Finally, use the count function over the window.
from pyspark.sql import functions as F
from pyspark.sql import Window

# Unix timestamp in seconds, cast to long so rangeBetween can work in seconds
df = df.withColumn('unix_ts', df.timestamp.astype('Timestamp').cast("long"))
# Trailing 28-day window per (person_id, dow), excluding the current row
w = Window.partitionBy('person_id', 'dow').orderBy('unix_ts').rangeBetween(-86400*28, -1)
df = df.withColumn('dow_count', F.count('unix_ts').over(w))
df.sort(df.unix_ts).show()
Bonus: How to create the actual day of week from the timestamp.
df = df.withColumn("DayOfWeek",F.date_format(df.timestamp, 'EEEE'))
I couldn't have done it without tips from jxc and this stackoverflow article.

Filling missing dates in spark dataframe column

I have a Spark data frame with columns "date" of type timestamp and "quantity" of type long. For each date, I have some value for quantity. The dates are sorted in increasing order, but some dates are missing.
For eg -
Current df -
Date | Quantity
10-09-2016 | 1
11-09-2016 | 2
14-09-2016 | 0
16-09-2016 | 1
17-09-2016 | 0
20-09-2016 | 2
As you can see, the df has some missing dates like 12-09-2016 and 13-09-2016. I want to put 0 in the quantity field for those missing dates, so that the resultant df looks like -
Date | Quantity
10-09-2016 | 1
11-09-2016 | 2
12-09-2016 | 0
13-09-2016 | 0
14-09-2016 | 0
15-09-2016 | 0
16-09-2016 | 1
17-09-2016 | 0
18-09-2016 | 0
19-09-2016 | 0
20-09-2016 | 2
Any help/suggestion regarding this will be appreciated. Thanks in advance.
Note that I am coding in scala.
I have written this answer in a somewhat verbose way for easier understanding. It can be optimized.
Needed imports
import java.time.format.DateTimeFormatter
import java.time.{LocalDate, LocalDateTime}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{LongType, TimestampType}
UDFs for String to Valid date format
val date_transform = udf((date: String) => {
  val dtFormatter = DateTimeFormatter.ofPattern("d-M-y")
  val dt = LocalDate.parse(date, dtFormatter)
  "%4d-%2d-%2d".format(dt.getYear, dt.getMonthValue, dt.getDayOfMonth)
    .replaceAll(" ", "0")
})
Below UDF code taken from Iterate over dates range
def fill_dates = udf((start: String, excludedDiff: Int) => {
  val dtFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
  val fromDt = LocalDateTime.parse(start, dtFormatter)
  (1 to (excludedDiff - 1)).map(day => {
    val dt = fromDt.plusDays(day)
    "%4d-%2d-%2d".format(dt.getYear, dt.getMonthValue, dt.getDayOfMonth)
      .replaceAll(" ", "0")
  })
})
Setting up sample dataframe (df)
val df = Seq(
("10-09-2016", 1),
("11-09-2016", 2),
("14-09-2016", 0),
("16-09-2016", 1),
("17-09-2016", 0),
("20-09-2016", 2)).toDF("date", "quantity")
.withColumn("date", date_transform($"date").cast(TimestampType))
.withColumn("quantity", $"quantity".cast(LongType))
df.printSchema()
root
|-- date: timestamp (nullable = true)
|-- quantity: long (nullable = false)
df.show()
+-------------------+--------+
| date|quantity|
+-------------------+--------+
|2016-09-10 00:00:00| 1|
|2016-09-11 00:00:00| 2|
|2016-09-14 00:00:00| 0|
|2016-09-16 00:00:00| 1|
|2016-09-17 00:00:00| 0|
|2016-09-20 00:00:00| 2|
+-------------------+--------+
Create a temporary dataframe(tempDf) to union with df:
val w = Window.orderBy($"date")
val tempDf = df.withColumn("diff", datediff(lead($"date", 1).over(w), $"date"))
  .filter($"diff" > 1) // Pick date diff more than one day to generate our dates
  .withColumn("next_dates", fill_dates($"date", $"diff"))
  .withColumn("quantity", lit("0"))
  .withColumn("date", explode($"next_dates"))
  .withColumn("date", $"date".cast(TimestampType))
tempDf.show(false)
+-------------------+--------+----+------------------------+
|date |quantity|diff|next_dates |
+-------------------+--------+----+------------------------+
|2016-09-12 00:00:00|0 |3 |[2016-09-12, 2016-09-13]|
|2016-09-13 00:00:00|0 |3 |[2016-09-12, 2016-09-13]|
|2016-09-15 00:00:00|0 |2 |[2016-09-15] |
|2016-09-18 00:00:00|0 |3 |[2016-09-18, 2016-09-19]|
|2016-09-19 00:00:00|0 |3 |[2016-09-18, 2016-09-19]|
+-------------------+--------+----+------------------------+
Now union two dataframes
val result = df.union(tempDf.select("date", "quantity"))
.orderBy("date")
result.show()
+-------------------+--------+
| date|quantity|
+-------------------+--------+
|2016-09-10 00:00:00| 1|
|2016-09-11 00:00:00| 2|
|2016-09-12 00:00:00| 0|
|2016-09-13 00:00:00| 0|
|2016-09-14 00:00:00| 0|
|2016-09-15 00:00:00| 0|
|2016-09-16 00:00:00| 1|
|2016-09-17 00:00:00| 0|
|2016-09-18 00:00:00| 0|
|2016-09-19 00:00:00| 0|
|2016-09-20 00:00:00| 2|
+-------------------+--------+
Based on @mrsrinivas's excellent answer, here is the PySpark version.
Needed imports
from typing import List
import datetime
from pyspark.sql import DataFrame, Window
from pyspark.sql.functions import col, lit, udf, datediff, lead, explode
from pyspark.sql.types import DateType, ArrayType
UDF to create the range of next dates
def _get_next_dates(start_date: datetime.date, diff: int) -> List[datetime.date]:
    return [start_date + datetime.timedelta(days=days) for days in range(1, diff)]
Function to create the DataFrame filling the dates (supports "grouping" columns):
def _get_fill_dates_df(df: DataFrame, date_column: str, group_columns: List[str], fill_column: str) -> DataFrame:
    get_next_dates_udf = udf(_get_next_dates, ArrayType(DateType()))
    window = Window.orderBy(*group_columns, date_column)
    return df.withColumn("_diff", datediff(lead(date_column, 1).over(window), date_column)) \
        .filter(col("_diff") > 1) \
        .withColumn("_next_dates", get_next_dates_udf(date_column, "_diff")) \
        .withColumn(fill_column, lit("0")) \
        .withColumn(date_column, explode("_next_dates")) \
        .drop("_diff", "_next_dates")
The usage of the function:
fill_df = _get_fill_dates_df(df, "Date", [], "Quantity")
df = df.union(fill_df)
It assumes that the date column is already in date type.
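If the date column is still a string, a hypothetical usage sketch (the column name and the dd-MM-yyyy format are assumptions taken from the question's sample data):
from pyspark.sql.functions import to_date, col

# to_date with an explicit format string requires Spark 2.2+
df = df.withColumn("Date", to_date(col("Date"), "dd-MM-yyyy"))
fill_df = _get_fill_dates_df(df, "Date", [], "Quantity")
df = df.union(fill_df).orderBy("Date")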
Here is a slight modification that works with months and takes measure columns (columns that should be set to zero) instead of group columns:
from typing import List
import datetime
from dateutil import relativedelta
import math
import pyspark.sql.functions as f
from pyspark.sql import DataFrame, Window
from pyspark.sql.types import DateType, ArrayType, IntegerType

def fill_time_gaps_date_diff_based(df: DataFrame, measure_columns: list, date_column: str):
    group_columns = [col for col in df.columns if col not in [date_column] + measure_columns]
    # save measure sums for qc
    qc = df.agg({col: 'sum' for col in measure_columns}).collect()
    # convert month to date
    convert_int_to_date = f.udf(lambda mth: datetime.datetime(year=math.floor(mth / 100), month=mth % 100, day=1), DateType())
    df = df.withColumn(date_column, convert_int_to_date(date_column))
    # sort values
    df = df.orderBy(group_columns)
    # get_fill_dates_df (instead of months_between also use datediff for days)
    window = Window.orderBy(*group_columns, date_column)
    # calculate diff column
    fill_df = df.withColumn(
        "_diff",
        f.months_between(f.lead(date_column, 1).over(window), date_column).cast(IntegerType())
    ).filter(
        f.col("_diff") > 1
    )
    # generate next dates
    def _get_next_dates(start_date: datetime.date, diff: int) -> List[datetime.date]:
        return [
            start_date + relativedelta.relativedelta(months=months)
            for months in range(1, diff)
        ]
    get_next_dates_udf = f.udf(_get_next_dates, ArrayType(DateType()))
    fill_df = fill_df.withColumn(
        "_next_dates",
        get_next_dates_udf(date_column, "_diff")
    )
    # set measure columns to 0
    for col in measure_columns:
        fill_df = fill_df.withColumn(col, f.lit(0))
    # explode next_dates column
    fill_df = fill_df.withColumn(date_column, f.explode('_next_dates'))
    # drop unnecessary columns
    fill_df = fill_df.drop(
        "_diff",
        "_next_dates"
    )
    # union df with fill_df
    df = df.union(fill_df)
    # qc: should be removed for productive runs
    if qc != df.agg({col: 'sum' for col in measure_columns}).collect():
        raise ValueError('Sums before and after run do not fit.')
    return df
Please note that I assume the month is given as an integer in the form YYYYMM. This can easily be adjusted by modifying the "convert month to date" part.
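As a quick standalone check (my addition) of the YYYYMM-to-date conversion used inside the function above, with plain Python and no Spark session needed:
import datetime
import math

mth = 201807
print(datetime.datetime(year=math.floor(mth / 100), month=mth % 100, day=1))
# prints: 2018-07-01 00:00:00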