User Defined Aggregate Function in Spark to implement percentile - scala

I am trying to write a UDAF to calculate percentile values.
I need to write a custom function because the existing Spark functions percentile_approx, approx_percentile and percentile use rounding differently than I need.
I need to use floor instead of midpoint rounding. Is there any way I can write it in PySpark?
If not, how can I achieve this in Scala?
I need to calculate the percentile using the method below:
import math

def percentile_custom(lst, per):
    lst = sorted(lst)
    rank = (len(lst) + 1) * per
    ir = math.floor(rank)
    ir1 = math.ceil(rank)
    if ir == ir1:
        return lst[ir - 1]
    else:
        fr = rank - ir
        ir_qh = lst[ir - 1]
        ir_qh1 = lst[ir]
        inter = ((ir_qh1 - ir_qh) * fr) + ir_qh
        return math.floor(inter)
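One direct way to apply this in PySpark (a minimal sketch, assuming each group's values fit comfortably in a collected list; the dataframe df and the column names grp and value are placeholders, not from the original post) is to collect the values per group and call percentile_custom through a UDF:

import pyspark.sql.functions as F
import pyspark.sql.types as T

# Wrap the function above in a UDF; the 0.33 level and the column names are illustrative only.
p33_udf = F.udf(lambda vals: float(percentile_custom(vals, 0.33)), T.DoubleType())

result = (df.groupBy("grp")
            .agg(F.collect_list("value").alias("vals"))
            .withColumn("p33", p33_udf("vals"))
            .drop("vals"))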

Below is the function I have written in PySpark for the same; let me know in case it doesn't work out for you:
from pyspark.sql import Window
import math
import pyspark.sql.types as T
import pyspark.sql.functions as F

def calc_percentile(perc_df, part_col, order_col, p_val=[33, 66], num_bins=100, max_bins=100, perc_col="p_band"):
    """
    Calculate percentiles with a number of bins on the specified columns.
    """
    win = Window.partitionBy(*part_col).orderBy(order_col)

    def perc_func(col, num, max_bins):
        # Integer division keeps the map keys and the range step integral.
        step = max_bins // num
        return {(p_tile // step): int(
            math.ceil(col * (p_tile / float(max_bins)))
        ) for p_tile in range(step, max_bins + step, step)}

    perc_udf = F.udf(perc_func, T.MapType(T.IntegerType(), T.IntegerType()))

    # perc_df.show()
    rank_data = perc_df.filter(
        F.col(order_col).isNotNull()
    ).withColumn(
        "rank", F.dense_rank().over(win)
    )
    rank_data.persist()
    rank_data.count()

    overall_count_data = rank_data.groupBy(
        *part_col
    ).agg(
        F.max(F.col("rank")).alias("count")
    ).select(
        F.explode(
            perc_udf(F.col("count"), F.lit(num_bins), F.lit(max_bins))
        ).alias("n_tile", "rank"), "count",
        *part_col
    )
    overall_count_data.persist()
    overall_count_data.count()

    return overall_count_data.join(
        rank_data, part_col + ["rank"]
    ).withColumn(
        perc_col,
        F.concat(F.lit("P_"), F.col("n_tile").cast("string"))
    ).groupBy(
        *part_col
    ).pivot(
        perc_col, ["P_{0}".format(p_val1) for p_val1 in p_val]
    ).agg(
        F.max(order_col)
    ).select(
        *(
            part_col + [F.col("P_{0}".format(p_val1)) for p_val1 in p_val]
        )
    )
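A hypothetical call (the dataframe and column names below are placeholders, not from the original post) could look like this:

out = calc_percentile(
    perc_df=sales_df,        # any dataframe containing the columns below
    part_col=["region"],     # partition column(s)
    order_col="amount",      # numeric column to rank
    p_val=[33, 66],          # returned as pivoted columns P_33 and P_66
)
out.show()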

Related

Pyspark / Databricks. Kolmogorov - Smirnov over time. Efficiently. In parallel

Hello StackOverflowers.
I have a pyspark dataframe that consists of a time_column and a column with values.
E.g.
+----------+--------------------+
| snapshot| values|
+----------+--------------------+
|2005-01-31| 0.19120256617637743|
|2005-01-31| 0.7972692479278891|
|2005-02-28|0.005236883665445502|
|2005-02-28| 0.5474099672222935|
|2005-02-28| 0.13077227571485905|
+----------+--------------------+
I would like to perform a KS test of each snapshot value with the previous one.
I tried to do it with a for loop.
import numpy as np
from scipy.stats import ks_2samp
import pyspark.sql.functions as F
def KS_for_one_snapshot(temp_df, snapshots_list, j, var="values"):
    sample1 = temp_df.filter(F.col("snapshot") == snapshots_list[j])
    sample2 = temp_df.filter(F.col("snapshot") == snapshots_list[j-1])  # pick the last snapshot as the one to compare with
    if sample1.count() == 0 or sample2.count() == 0:
        ks_value = -1  # previously "0 observations" which gave type error
    else:
        ks_value, p_value = ks_2samp(np.array(sample1.select(var).collect()).reshape(-1),
                                     np.array(sample2.select(var).collect()).reshape(-1),
                                     alternative="two-sided",
                                     mode="auto")
    return ks_value

results = []
snapshots_list = df.select('snapshot').dropDuplicates().sort('snapshot').rdd.flatMap(lambda x: x).collect()
for j in range(len(snapshots_list) - 1):
    results.append(KS_for_one_snapshot(df, snapshots_list, j + 1))
results
But in reality the data is huge, so it takes forever. I am using Databricks and PySpark, so I wonder what a more efficient way to run this would be, avoiding the for loop and utilizing the available workers.
I tried to do it using a UDF, but in vain.
Any ideas?
PS. you can generate the data with the following code.
from random import randint
import pyspark.sql.types as T

df = (spark.createDataFrame(range(1, 1000), T.IntegerType())
      .withColumn('snapshot', F.array(F.lit("2005-01-31"), F.lit("2005-02-28"), F.lit("2005-03-30")).getItem((F.rand()*3).cast("int")))
      .withColumn('values', F.rand()).drop('value')
)
Update:
I tried the following using a UDF.

from pyspark.sql import Window
import pyspark.sql.types as T

var_used = 'values'
data_input_1 = df.groupBy('snapshot').agg(F.collect_list(var_used).alias('value_list'))
data_input_2 = df.groupBy('snapshot').agg(F.collect_list(var_used).alias("value_list_2"))
windowSpec = Window.orderBy("snapshot")
data_input_2 = data_input_2.withColumn('snapshot_2', F.lag("snapshot", 1).over(windowSpec)).filter('snapshot_2 is not NULL')
data_input_final = data_input_1.join(data_input_2, data_input_1.snapshot == data_input_2.snapshot_2)

def KS_one_snapshot_general(sample_in_list_1, sample_in_list_2):
    if len(sample_in_list_1) == 0 or len(sample_in_list_2) == 0:
        ks_value = -1  # previously "0 observations" which gave type error
    else:
        print('something')
        ks_value, p_value = ks_2samp(sample_in_list_1,
                                     sample_in_list_2,
                                     alternative="two-sided",
                                     mode="auto")
    return ks_value

KS_one_snapshot_general_udf = F.udf(KS_one_snapshot_general, T.FloatType())
data_input_final.select(KS_one_snapshot_general_udf('value_list', 'value_list_2')).display()
This works fine if the dataset (per snapshot) is small. But if I increase the number of rows, I end up with an error:
PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
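A likely cause, offered here as a hedged note rather than a confirmed fix: ks_2samp returns numpy float64 values, and a PySpark UDF declared with T.FloatType() cannot serialize numpy scalars, which produces exactly this PickleException. Casting the result to a plain Python float before returning usually avoids it, e.g.:

def KS_one_snapshot_general(sample_in_list_1, sample_in_list_2):
    if len(sample_in_list_1) == 0 or len(sample_in_list_2) == 0:
        return -1.0
    ks_value, p_value = ks_2samp(sample_in_list_1, sample_in_list_2,
                                 alternative="two-sided", mode="auto")
    return float(ks_value)  # plain Python float instead of numpy.float64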

Updating a Dash Callback using RadioItems

I am fairly new to Python coding, so I apologize in advance for my ignorance. I am trying to create a Dash app that drops outliers using standard deviation. The user selects a standard deviation using a RadioItems input.
My question is: what amendments do I need to make to my code so that the RadioItems value updates max_deviations using a callback?
Import packages, clean the data and define a query
import dash
import plotly.express as px
from dash import Dash, dcc, html, Input, Output, State
import pandas as pd
import numpy as np
app = dash.Dash(__name__)
server = app.server
df=pd.read_csv(r'C:\SVS_GIS\POWER BI\CSV_DATA\QSAS2021.csv', encoding='unicode_escape')
#SET DATE OF VALUATION
df['TIME'] = ((pd.to_datetime(df['Sale Date'], dayfirst=True)
               .rsub(pd.to_datetime('01/10/2021', dayfirst=True))
               .dt.days
               ) * -1)
df=df[df['TIME'] >= -365]
df = df.query("(SMA >=1 and SMA <= 3) and (LGA==60)")
prepare dataframe for dropping outliers
data = pd.DataFrame(data=df)
x = df.TIME
y = df.CHANGE
mean = np.mean(y)
standard_deviation = np.std(y)
distance_from_mean = abs(y - mean)
app layout
app.layout = html.Div([
    html.Label("Standard Deviation Picker:", style={'fontSize': 25, 'textAlign': 'center'}),
    html.Br(),
    html.Label("1.0 = 68%, 2.0 = 95%, 3.0 = 99.7%", style={'fontSize': 15,
                                                           'textAlign': 'center'}),
    html.Div(id="radio_items"),
    dcc.RadioItems(
        options=[{'label': i, 'value': i} for i in [1.0, 2.0, 3.0]],
        value=2.0
    ),
    html.Div([
        dcc.Graph(id="the_graph")
    ])
])
callback
@app.callback(
    Output("the_graph", "figure"),
    Input("radio_items", 'value')
)
def update_graph(max_deviations):
    not_outlier = distance_from_mean < max_deviations * standard_deviation
    no_outliers = y[not_outlier]
    trim_outliers = pd.DataFrame(data=no_outliers)
    dff = pd.merge(trim_outliers, df, left_index=True, right_index=True)
    return (dff)
    fig = px.scatter(dff, x='TIME', y='CHANGE_y',
                     color='SMA',
                     trendline='ols',
                     size='PV',
                     height=500,
                     width=800,
                     hover_name='SMA',
                     )
    return dcc.Graph(id='the_graph', figure=fig)

if __name__ == '__main__':
    app.run_server(debug=False)
Your dcc.RadioItems doesn't have an id prop. Add that, and make sure it matches the ID given in the callback, and you should be good.
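For example, a minimal sketch of that change (note the existing html.Div(id="radio_items") placeholder would need to lose that id, or the RadioItems would need a different one, to avoid a duplicate-ID error):

dcc.RadioItems(
    id="radio_items",  # must match Input("radio_items", "value") in the callback
    options=[{'label': i, 'value': i} for i in [1.0, 2.0, 3.0]],
    value=2.0
),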

scipy.integrate.nquad ignoring opts?

I need to compute a numerical (triple) integral, but do not need very high precision on the value, and would therefore like to sacrifice some precision for speed when using nquad. I thought that I might be able to do this by increasing the epsrel and/or epsabs options, but they seem to have no effect. For example (note, this is just an example integrand - I don't actually need to compute this particular integral...):
import numpy as np
from scipy.integrate import nquad

def integrand(l, b, d, sigma=250):
    x = d * np.cos(l) * np.cos(b)
    y = d * np.sin(l) * np.cos(b)
    z = d * np.sin(b)
    return np.exp(-0.5 * z**2 / sigma**2) / np.sqrt(2*np.pi * sigma**2)

ranges = [
    (0, 2*np.pi),
    (0.5, np.pi/2),
    (0, 1000.)
]

# No specification of `opts` - use the default epsrel and epsabs:
result1 = nquad(integrand, ranges=ranges, full_output=True)

# Set some `quad` opts:
result2 = nquad(integrand, ranges=ranges, full_output=True,
                opts=dict(epsabs=1e-1, epsrel=0, limit=3))
Both outputs are identical:
>>> print(result1)
(4.252394424844468, 1.525272379143154e-12, {'neval': 9261})
>>> print(result2)
(4.252394424844468, 1.525272379143154e-12, {'neval': 9261})
A full example is included here: https://gist.github.com/adrn/b9aa92c236df011dbcdc131aa94ed9f9
Is this not the right approach, or is scipy.integrate ignoring my inputted opts?
The scipy.integrate.nquad documentation states that opts holds options to be passed on to quad, as can be seen here:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.integrate.nquad.html
Example of application:
import numpy as np
from scipy.integrate import quad

def integrand(a, sigma=250):
    x = 2 * np.sin(a) * np.cos(a)
    return x

# No specification of `opts` - use the default epsrel and epsabs:
result1 = quad(integrand, 0, 2*np.pi)

# Set some `quad` opts:
result2 = quad(integrand, 0, 4*np.pi, epsabs=1e-6, epsrel=1e-6, limit=40)
returns:
result1: (-1.3690011097614755e-16, 4.4205541621600365e-14)
result2: (-1.7062635631484713e-15, 9.096805257467047e-14)
The reason nquad doesn't complain about the presence of options is that nquad includes quad, dblquad and tplquad.
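As a side note (a hedged illustration based on the documented nquad signature, not something shown in this thread), opts can also be supplied per integration variable as a sequence of dicts, one for each entry in ranges:

# One options dict per integration variable, in the same order as `ranges`.
result3 = nquad(integrand, ranges=ranges, full_output=True,
                opts=[{'epsabs': 1e-1}, {'epsrel': 1e-3}, {'limit': 50}])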

groupby function on a calculated column

I am joining multiple dataframes
and calculating the output by multiplying two columns from two different dataframes and dividing it by a column belonging to another dataframe.
I get a "grouping sequence expression is empty" error and "no_order is not an aggregate function".
What is wrong with the code?
df = df1.join(df2,df2["Code"] == df1["Code"],how = 'left')\
.join(df3, df3["ID"] == df1["ID"],how = 'left')\
.join(df4, df4["ID"] == df1["ID"],how = 'left')\
.join(df5, df5["Scenario"] == df1["Status"],how='left')\
.withColumn("Country",when(df1.Ind == 1,"WI"))\
.withColumn("Country",when(df1.Ind == 0,"AA"))\
.withColumn("Year",when(df1.Year == "2020","2021"))\
.agg((sum(df5["amt"] * df1["cost"]))/df2["no_order"]).alias('output')
.groupby('Country','Year','output')
The error shows you that df2["no_order"] should be within some aggregation function, for example the sum which you are using for df5["amt"] * df1["cost"].
Also move .groupby() above .agg().
If I understood correctly what you are trying to achieve, the code should look like:
df = df1\
.join(df2, on = 'Code', how = 'left')\
.join(df3, on = 'ID', how = 'left')\
.join(df4, on = 'ID', how = 'left')\
.join(df5, df5.Scenario == df1.Status, how='left')\
.withColumn('Country', when(df1.Ind == 1,"WI").when(df1.Ind == 0,"AA"))\
.withColumn('Year', when(df1.Year == "2020","2021"))\
.groupby('Country','Year')\
.agg(sum(df5["amt"] * df1["cost"] / df2["no_order"]).alias('output'))
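Note that this assumes when and sum are the Spark SQL functions rather than the Python builtins; the original snippet does not show its imports, so a hedged example would be:

from pyspark.sql.functions import when, sum  # Spark's when/sum, shadowing the builtin sum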

JSC is empty when creating spark DataFrame

I'm trying to learn Spark, so don't judge harshly. I have the following problem: I can run basic Spark examples like this one
import os
os.environ['PYSPARK_PYTHON'] = '/g/scb/patil/andrejev/python36/bin/python3'

import random
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import *
from pyspark.sql import *

sc.stop()
conf = SparkConf().setAppName('').setMaster('spark://remotehost:7789').setSparkHome('/path/to/spark-2.3.0-bin-hadoop2.7/')
sc = SparkContext(conf=conf)

num_samples = 100

def inside(p):
    x, y = random.random(), random.random()
    return x*x + y*y < 1

count = sc.parallelize(range(0, num_samples)).filter(inside).count()
pi = 4 * count / num_samples
print(pi)
but when I create a DataFrame I get an error that _jsc is NULL:
eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])

/usr/local/spark/python/pyspark/traceback_utils.py in __enter__(self)
     70     def __enter__(self):
     71         if SCCallSiteSync._spark_stack_depth == 0:
---> 72             self._context._jsc.setCallSite(self._call_site)
     73         SCCallSiteSync._spark_stack_depth += 1
Here are the environment variables that are set on the local machine:
SPARK_HOME: '/usr/local/spark/'
PYSPARK_DRIVER_PYTHON: '/usr/bin/python3'
PYSPARK_DRIVER_PYTHON_OPTS: 'notebook'
PYSPARK_PYTHON: '/g/scb/patil/andrejev/python36/bin/python3'
PATH: '...:/usr/lib/jvm/java-8-oracle/jre/bin:/usr/local/hadoop/bin:/usr/local/hadoop/sbin:/usr/local/spark/bin'
and on remote machine
PYSPARK_PYTHON=/g/scb/patil/andrejev/python36/bin/python3
PYSPARK_DIRVER_PYTHON=/g/scb/patil/andrejev/python36/bin/python3
In the end I figured out that I had two sessions (one default and one that I created) running at the same time. I ended up explicitly using my session to create the DataFrame.
sess = SparkSession(sc)
freq_signal = sess.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
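An alternative worth noting (a hedged suggestion, not part of the original answer): building the session up front from the same conf with getOrCreate avoids ever having two sessions in play, e.g.:

from pyspark.sql import SparkSession, Row

# Reuse the SparkConf built earlier; getOrCreate returns the one and only session.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

eDF = spark.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})])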