groupby function on a calculated column

I am joining multiple dataframes and calculating the output by multiplying two columns from two different dataframes and dividing the result by a column belonging to another dataframe.
I get the errors "grouping sequence expression is empty" and "no_order is not an aggregate function".
What is wrong with the code?
df = df1.join(df2, df2["Code"] == df1["Code"], how='left')\
    .join(df3, df3["ID"] == df1["ID"], how='left')\
    .join(df4, df4["ID"] == df1["ID"], how='left')\
    .join(df5, df5["Scenario"] == df1["Status"], how='left')\
    .withColumn("Country", when(df1.Ind == 1, "WI"))\
    .withColumn("Country", when(df1.Ind == 0, "AA"))\
    .withColumn("Year", when(df1.Year == "2020", "2021"))\
    .agg((sum(df5["amt"] * df1["cost"])) / df2["no_order"]).alias('output')
    .groupby('Country', 'Year', 'output')

The error tells you that df2["no_order"] should be inside some aggregation function, for example the sum you are already using for df5["amt"] * df1["cost"].
Also, move .groupby() above .agg().
If I understood correctly what you are trying to achieve, the code should look like:
df = df1\
    .join(df2, on='Code', how='left')\
    .join(df3, on='ID', how='left')\
    .join(df4, on='ID', how='left')\
    .join(df5, df5.Scenario == df1.Status, how='left')\
    .withColumn('Country', when(df1.Ind == 1, "WI").when(df1.Ind == 0, "AA"))\
    .withColumn('Year', when(df1.Year == "2020", "2021"))\
    .groupby('Country', 'Year')\
    .agg(sum(df5["amt"] * df1["cost"] / df2["no_order"]).alias('output'))
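One thing to keep in mind (a side note, not part of the question): when() without a trailing otherwise() returns null for rows that match none of the branches, so here Year will be null for anything other than "2020" and Country will be null when Ind is neither 0 nor 1. A minimal, self-contained demonstration (the demo DataFrame is made up):

# Demo of when() with and without otherwise(); data is illustrative only.
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("2019",), ("2020",)], ["Year"])

# Without otherwise(): the "2019" row becomes null
demo.withColumn("Year", when(col("Year") == "2020", "2021")).show()

# With otherwise(): unmatched rows keep their original value
demo.withColumn("Year", when(col("Year") == "2020", "2021").otherwise(col("Year"))).show()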

Related

Implement Louvain in pyspark using dataframes

I'm trying to implement the Louvain algorithm in pyspark using dataframes. The problem is that my implementation is really slow. This is how I do it:
I collect all vertices and communityIds into simple python lists
For each vertex - communityId pair I calculate the modularity gain using dataframes (just a fancy formula involving edge weights sums/differences)
Repeat until no change
What am I doing wrong?
I suppose that if I could somehow parallelize the for each loop the performance would increase, but how can I do that?
LATER EDIT:
I could use vertices.foreach(changeCommunityId) instead of the for each loop, but then I'd have to compute the modularity gain (that fancy formula) without dataframes.
See the code sample below:
def louvain(self):
    oldModularity = 0  # since initially each node represents a community
    graph = self.graph
    # retrieve graph vertices and edges dataframes
    vertices = verticesDf = self.graph.vertices
    aij = edgesDf = self.graph.edges

    canOptimize = True
    allCommunityIds = [row['communityId'] for row in verticesDf.select('communityId').distinct().collect()]
    verticesIdsCommunityIds = [(row['id'], row['communityId']) for row in verticesDf.select('id', 'communityId').collect()]

    allEdgesSum = self.graph.edges.groupBy().sum('weight').collect()
    m = allEdgesSum[0]['sum(weight)'] / 2

    def computeModularityGain(vertexId, newCommunityId):
        # the sum of all weights of the edges within C
        sourceNodesNewCommunity = vertices.join(aij, vertices.id == aij.src) \
            .select('weight', 'src', 'communityId') \
            .where(vertices.communityId == newCommunityId)
        destinationNodesNewCommunity = vertices.join(aij, vertices.id == aij.dst) \
            .select('weight', 'dst', 'communityId') \
            .where(vertices.communityId == newCommunityId)

        k_in = sourceNodesNewCommunity.join(destinationNodesNewCommunity, sourceNodesNewCommunity.communityId == destinationNodesNewCommunity.communityId) \
            .count()

        # the rest of the formula computation goes here, I just wanted to show you an example
        # just return some value for the modularity
        return 0.9

    def changeCommunityId(vertexId, currentCommunityId):
        maxModularityGain = 0
        maxModularityGainCommunityId = None
        for newCommunityId in allCommunityIds:
            if newCommunityId != currentCommunityId:
                modularityGain = computeModularityGain(vertexId, newCommunityId)
                if modularityGain > maxModularityGain:
                    maxModularityGain = modularityGain
                    maxModularityGainCommunityId = newCommunityId

        if maxModularityGain > 0:
            return maxModularityGainCommunityId
        return currentCommunityId

    while canOptimize:
        while self.changeInModularity:
            self.changeInModularity = False
            for vertexCommunityIdPair in verticesIdsCommunityIds:
                vertexId = vertexCommunityIdPair[0]
                currentCommunityId = vertexCommunityIdPair[1]
                newCommunityId = changeCommunityId(vertexId, currentCommunityId)
                self.changeInModularity = False
        canOptimize = False
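If it helps, here is a sketch (my own assumption, not tested against the full algorithm) of how one piece of this could be kept entirely in dataframes: instead of launching a separate join/count job per (vertex, candidate community) pair, the edge weight each vertex sends into every community can be computed for all pairs at once with a single join and groupBy, and the best candidate per vertex could then be picked with a window function instead of a driver-side loop.

# Sketch only: for every vertex, the total edge weight into each community, in one pass.
# Column names (id, communityId, src, dst, weight) are assumed to match the graph above.
from pyspark.sql import functions as F

def weight_into_communities(vertices, edges):
    # attach the destination vertex's community to every edge
    dst_communities = vertices.select(
        F.col("id").alias("dst"),
        F.col("communityId").alias("dstCommunityId"),
    )
    # k_i_in for all (source vertex, community) pairs at once
    return (
        edges.join(dst_communities, on="dst")
             .groupBy("src", "dstCommunityId")
             .agg(F.sum("weight").alias("k_i_in"))
    )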

Group_by returns just one row while aggregate returns the expected outcome

I am currently stuck at the post-processing of some EddyData. Following an example (https://github.com/bgctw/REddyProc/blob/master/vignettes/aggUncertainty.md) I got an unexpected result from group_by which is reproducible, but I don't understand why:
group_by returns just one row while aggregate gives the expected outcome.
Here is a minimal example:
library(tidyverse)
#create example data frame
date.time <- seq(from=as.POSIXct("2015-01-01 00:30:00"), to=as.POSIXct("2015-01-03 00:30:00"),by="30 mins")
nee <- runif(length(date.time),-200,200)
df <- data.frame(date.time, nee)
#calculate day of the year
df <- df %>% mutate(
  date.time = df$date.time
  , DoY = as.POSIXlt(date.time - 15*60)$yday # midnight belongs to the previous day
)
#trying to summarise nee for each day
aggDay <- df %>% group_by(DoY) %>% summarise(nee=sum(nee))
aggDay
nee
1 322.1195
aggDay just returns one row while aggregate would work in this case
aggregate(df$nee, by=list(df$DoY), sum)
Group.1 x
1 0 -25.15698
2 1 448.13960
3 2 -100.86310
Unfortunately, the original code involves some further calculations which is the reason why I'd like to stay with group_by.
#original code, not reproducible here
aggDay <- df %>% group_by(DoY) %>%
  summarise(
    DateTime = first(DateTime)
    , nRec = sum(NEE_uStar_fqc == 0, na.rm = TRUE)
    , nEff = computeEffectiveNumObs(
        resid, effAcf = !!autoCorr, na.rm = TRUE)
    , NEE = mean(NEE_uStar_f, na.rm = TRUE)
    , sdNEE = if (nEff <= 1) NA_real_ else sqrt(
        mean(NEE_uStar_fsd^2, na.rm = TRUE) / (nEff - 1))
    , sdNEEuncorr = if (nRec == 0) NA_real_ else sqrt(
        mean(NEE_uStar_fsd^2, na.rm = TRUE) / (nRec - 1))
  )
I restarted RStudio and now it works. Don't ask me why; there must have been a conflict with another loaded package, most likely one masking dplyr's summarise.

How to get values from a dataframe column using SparkSQL?

Right now I am working with Spark/Scala and I am trying to join multiple dataframes to get the expected output.
The input data are CSV files with call record information. These are the main input fields:
a_number:String = is the origin call number.
area_code_a:String = is the a_number area code.
prefix_a:String = is the a_number prefix.
b_number:String = is the destination call number.
area_code_b:String = is the b_number area code.
prefix_b:String = is the b_number prefix.
cause_value:String = is the call final status.
val dfint = ((cdrs_nac.join(grupos_nac).where(col("causevalue") === col("id")))
  .join(centrales_nac, col("dpc") === col("pointcode_decimal"), "left")
  .join(series_nac_a).where(col("area_code_a") === col("codigo_area") &&
    col("prefix_a") === col("prefijo") &&
    col("series_a") >= col("serie_inicial") &&
    col("series_a") <= col("serie_final"))
  .join(series_nac_b, (
    ((col("codigo_area_b") === col("area_code_b")) && col("len_b_number") == "8") ||
    ((col("codigo_area_b") === col("area_code_b")) && col("len_b_number") == "10") ||
    ((col("codigo_area_b") === col("codigo_area_cent")) && col("len_b_number") == "7")) &&
    col("prefix_b") === col("prefijo_b") &&
    col("series_b") >= col("serie_inicial_b") &&
    col("series_b") <= col("serie_final_b"), "left")
This generates multiple output files with the processed call data records, including the column "len_b_number", which holds the length of the b_number field.
While doing some tests I found that, for some reason, the expression col("len_b_number") is returning the column name "len_b_number" instead of the length values, which are 7, 8 or 10. This means the col("len_b_number") == 7 OR col("len_b_number") == 8 OR col("len_b_number") == 10 conditions will never work, because the code always compares against the column name.
At the moment the output is blank because col("len_b_number") doesn't match 7, 8 or 10. I would like to know if you can help me understand how to extract the value from this column.
Thanks
Try using === instead of ==.
I could not reproduce your error.
&& col("len_b_number") == "8"
should be:
&& col("len_b_number") === "8"

User Defined Aggregate Function in Spark to implement percentile

I am trying to write a UDAF to calculate percentile values.
I need to write a custom function because the existing Spark functions percentile_approx, approx_percentile and percentile round differently than I need.
I need to use floor instead of midpoint rounding. Is there any way I can write this in PySpark?
If not, how can I achieve it in Scala?
I need to calculate the percentile using the method below:
import math

def percentile_custom(lst, per):
    lst = sorted(lst)
    rank = (len(lst) + 1) * per
    ir = math.floor(rank)
    ir1 = math.ceil(rank)
    if ir == ir1:
        return lst[ir - 1]
    else:
        fr = rank - ir
        ir_qh = lst[ir - 1]
        ir_qh1 = lst[ir]
        inter = ((ir_qh1 - ir_qh) * fr) + ir_qh
        return math.floor(inter)
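One way this could be done in PySpark without a true UDAF (a sketch under the assumption that each group's values fit in memory on an executor) is to collect each group's values with collect_list and apply the function above as a regular UDF; "group" and "value" are placeholder column names, not from the question:

# Sketch: per-group custom percentile via collect_list + UDF (illustrative data).
from pyspark.sql import SparkSession, functions as F, types as T

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1.0), ("a", 2.0), ("a", 3.0), ("b", 10.0), ("b", 20.0)],
    ["group", "value"],
)

percentile_udf = F.udf(lambda xs, p: float(percentile_custom(xs, p)), T.DoubleType())

result = (df.groupBy("group")
            .agg(F.collect_list("value").alias("values"))
            .withColumn("p50", percentile_udf(F.col("values"), F.lit(0.5))))
result.show()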
Below is a function I have written in PySpark for the same purpose; let me know in case it doesn't work out for you:
from pyspark.sql import Window
import math
import pyspark.sql.types as T
import pyspark.sql.functions as F

def calc_percentile(perc_df, part_col, order_col, p_val=[33, 66], num_bins=100, max_bins=100, perc_col="p_band"):
    """
    Calculate percentiles with a number of bins on the specified columns.
    """
    win = Window.partitionBy(*part_col).orderBy(order_col)

    def perc_func(col, num, max_bins):
        # integer division so the step works with range() and as an integer map key
        step = max_bins // num
        return {(p_tile // step): int(
            math.ceil(col * (p_tile / float(max_bins)))
        ) for p_tile in range(step, max_bins + step, step)}

    perc_udf = F.udf(perc_func, T.MapType(T.IntegerType(), T.IntegerType()))

    # perc_df.show()
    # dense-rank the non-null rows within each partition
    rank_data = perc_df.filter(
        F.col(order_col).isNotNull()
    ).withColumn(
        "rank", F.dense_rank().over(win)
    )
    rank_data.persist()
    rank_data.count()

    # map each percentile bin to the rank at which it ends
    overall_count_data = rank_data.groupBy(
        *part_col
    ).agg(
        F.max(
            F.col("rank")
        ).alias("count")
    ).select(
        F.explode(
            perc_udf(F.col("count"), F.lit(num_bins), F.lit(max_bins))
        ).alias("n_tile", "rank"), "count",
        *part_col
    )
    overall_count_data.persist()
    overall_count_data.count()

    return overall_count_data.join(
        rank_data, part_col + ["rank"]
    ).withColumn(
        perc_col,
        F.concat(F.lit("P_"), F.col("n_tile").cast("string"))
    ).groupBy(
        *part_col
    ).pivot(
        perc_col, ["P_{0}".format(p_val1) for p_val1 in p_val]
    ).agg(
        F.max(order_col)
    ).select(
        *(
            part_col + [F.col("P_{0}".format(p_val1)) for p_val1 in p_val]
        )
    )
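A hypothetical call (reusing, for illustration, the small df with "group" and "value" columns from the sketch above; these names are not from the question) could look like:

# Hypothetical usage of calc_percentile; column names are placeholders.
result = calc_percentile(perc_df=df, part_col=["group"], order_col="value", p_val=[33, 66])
result.show()
# one row per group, with columns P_33 and P_66 holding the values at those bands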

selecting max value in the column

I have data like this:
TagID,ListnerID,Timestamp,Sum_RSSI
2,101,1496745906,90
3,102,1496745907,70
3,104,1496745906,80
2,101,1496745909,60
4,106,1496745908,60
My expected output would be
2,101,1496745906,90
3,104,1496745906,80
4,106,1496745908,60
I tried this:
val high_window = Window.partitionBy($"tagShortID")
val prox = averageDF
.withColumn("rank", row_number().over(window.orderBy($"Sum_RSSI".desc)))
.filter($"rank" === 1)
But it prints all the rows. Any help would be appreciated.
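For reference, here is a PySpark sketch of the same approach (my own illustration, not from the question; note that the snippet above defines high_window but orders over window, and partitions by tagShortID while the sample data calls the column TagID):

# Sketch only: keep the row with the highest Sum_RSSI per TagID (sample data from above).
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
averageDF = spark.createDataFrame(
    [(2, 101, 1496745906, 90), (3, 102, 1496745907, 70), (3, 104, 1496745906, 80),
     (2, 101, 1496745909, 60), (4, 106, 1496745908, 60)],
    ["TagID", "ListnerID", "Timestamp", "Sum_RSSI"],
)

high_window = Window.partitionBy("TagID").orderBy(F.col("Sum_RSSI").desc())
prox = (averageDF
        .withColumn("rank", F.row_number().over(high_window))
        .filter(F.col("rank") == 1)
        .drop("rank"))
prox.show()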