I am trying to add a few columns based on the input variable vIssueCols:
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
vIssueCols=['jobid','locid']
vQuery1 = 'vSrcData2= vSrcData'
vWindow1 = Window.partitionBy("vKey").orderBy("vOrderBy")
for x in vIssueCols:
    vQuery1 = vQuery1 + '.withColumn("' + x + '_prev",F.lag(vSrcData.' + x + ').over(vWindow1))'
exec(vQuery1)
The loop above generates vQuery1 as shown below, and it works:
vSrcData2= vSrcData.withColumn("jobid_prev",F.lag(vSrcData.jobid).over(vWindow1)).withColumn("locid_prev",F.lag(vSrcData.locid).over(vWindow1))
But can't I write a query directly, something like
vSrcData2= vSrcData.withColumn(x+"_prev",F.lag(vSrcData.x).over(vWindow1))for x in vIssueCols
and generate the columns with a loop statement? Some blogs have suggested adding a UDF and calling that, but instead of using a UDF I will use the exec string method above.
You can build your query using reduce.
from pyspark.sql.functions import lag
from pyspark.sql.window import Window
from functools import reduce
#sample data
df = sc.parallelize([[1, 200, '1234', 'asdf'],
                     [1, 50, '2345', 'qwerty'],
                     [1, 100, '4567', 'xyz'],
                     [2, 300, '123', 'prem'],
                     [2, 10, '000', 'ankur']]).\
    toDF(["vKey","vOrderBy","jobid","locid"])
df.show()
vWindow1 = Window.partitionBy("vKey").orderBy("vOrderBy")
#your existing processing
df1 = df.\
    withColumn("jobid_prev", lag(df.jobid).over(vWindow1)).\
    withColumn("locid_prev", lag(df.locid).over(vWindow1))
df1.show()
#to-be processing
vIssueCols=['jobid','locid']
df2 = reduce(
    lambda r_df, col_name: r_df.withColumn(col_name + "_prev", lag(r_df[col_name]).over(vWindow1)),
    vIssueCols,
    df
)
df2.show()
Sample data:
+----+--------+-----+------+
|vKey|vOrderBy|jobid| locid|
+----+--------+-----+------+
| 1| 200| 1234| asdf|
| 1| 50| 2345|qwerty|
| 1| 100| 4567| xyz|
| 2| 300| 123| prem|
| 2| 10| 000| ankur|
+----+--------+-----+------+
Output:
+----+--------+-----+------+----------+----------+
|vKey|vOrderBy|jobid| locid|jobid_prev|locid_prev|
+----+--------+-----+------+----------+----------+
| 1| 50| 2345|qwerty| null| null|
| 1| 100| 4567| xyz| 2345| qwerty|
| 1| 200| 1234| asdf| 4567| xyz|
| 2| 10| 000| ankur| null| null|
| 2| 300| 123| prem| 000| ankur|
+----+--------+-----+------+----------+----------+
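As a side note, if reduce feels unfamiliar, a plain for loop with reassignment gives the same result and also avoids exec entirely (a minimal sketch reusing df, vWindow1 and vIssueCols from above; df3 is just a hypothetical name):
df3 = df
for c in vIssueCols:
    # withColumn returns a new DataFrame, so reassigning builds up the columns one by one
    df3 = df3.withColumn(c + "_prev", lag(df3[c]).over(vWindow1))
df3.show()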
Hope this helps!
I'm trying to create a column with the standardized (z-score) value of a column x in a Spark dataframe, but I'm missing something because none of it is working.
Here's my example:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from scipy.stats import zscore
@pandas_udf('float')
def zscore_udf(x: pd.Series) -> pd.Series:
    return zscore(x)
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
columns = ["id","x"]
data = [("a", 81.0),
("b", 36.2),
("c", 12.0),
("d", 81.0),
("e", 36.3),
("f", 12.0),
("g", 111.7)]
df = spark.createDataFrame(data=data,schema=columns)
df.show()
df = df.withColumn('y', zscore_udf(df.x))
df.show()
Which results in obviously wrong calculations:
+---+-----+----+
| id| x| y|
+---+-----+----+
| a| 81.0|null|
| b| 36.2| 1.0|
| c| 12.0|-1.0|
| d| 81.0| 1.0|
| e| 36.3|-1.0|
| f| 12.0|-1.0|
| g|111.7| 1.0|
+---+-----+----+
Thank you for your help.
How to fix:
Instead of using a UDF, calculate the stddev_pop and the avg over the whole dataframe and compute the z-score manually.
I suggest using a window function over the entire dataframe for the first step and then simple arithmetic to get the z-score.
See the suggested code:
from pyspark.sql.functions import avg, col, stddev_pop
from pyspark.sql.window import Window
df2 = df \
    .select(
        "*",
        avg("x").over(Window.partitionBy()).alias("avg_x"),
        stddev_pop("x").over(Window.partitionBy()).alias("stddev_x"),
    ) \
    .withColumn("manual_z_score", (col("x") - col("avg_x")) / col("stddev_x"))
Why didn't the UDF work?
Spark is used for distributed computation. When you perform operations on a DataFrame, Spark distributes the workload across partitions on the available executors/workers.
pandas_udf is no different. When running a UDF of type pd.Series -> pd.Series, some rows are sent to partition X and some to partition Y; when zscore runs, it calculates the mean and std of the data in that partition only and writes the z-score based on that data alone.
I'll use spark_partition_id to "prove" this.
Rows a, b, c were mapped to partition 0, while d, e, f, g went to partition 1. I manually calculated the mean/stddev_pop of both the entire set and the partitioned data and then calculated the z-score. The UDF z-score was equal to the z-score of the partition.
from pyspark.sql.functions import pandas_udf, spark_partition_id, avg, stddev, col, stddev_pop
from pyspark.sql.window import Window
df2 = df \
    .select(
        "*",
        zscore_udf(df.x).alias("z_score"),
        spark_partition_id().alias("partition"),
        avg("x").over(Window.partitionBy(spark_partition_id())).alias("avg_partition_x"),
        stddev_pop("x").over(Window.partitionBy(spark_partition_id())).alias("stddev_partition_x"),
    ) \
    .withColumn("partition_z_score", (col("x") - col("avg_partition_x")) / col("stddev_partition_x"))
df2.show()
+---+-----+-----------+---------+-----------------+------------------+--------------------+
| id| x| z_score|partition| avg_partition_x|stddev_partition_x| partition_z_score|
+---+-----+-----------+---------+-----------------+------------------+--------------------+
| a| 81.0| 1.327058| 0|43.06666666666666|28.584533502500186| 1.3270579815484989|
| b| 36.2|-0.24022315| 0|43.06666666666666|28.584533502500186|-0.24022314955974558|
| c| 12.0| -1.0868348| 0|43.06666666666666|28.584533502500186| -1.0868348319887526|
| d| 81.0| 0.5366879| 1| 60.25|38.663063768925504| 0.5366879387524718|
| e| 36.3|-0.61945426| 1| 60.25|38.663063768925504| -0.6194542714757446|
| f| 12.0| -1.2479612| 1| 60.25|38.663063768925504| -1.247961110593097|
| g|111.7| 1.3307275| 1| 60.25|38.663063768925504| 1.3307274433163698|
+---+-----+-----------+---------+-----------------+------------------+--------------------+
I also added df.repartition(8) prior to the calculation and got results similar to those in the original question: partitions with 0 stddev --> null z-score, partitions with 2 rows --> (-1, 1) z-scores.
+---+-----+-------+---------+---------------+------------------+-----------------+
| id| x|z_score|partition|avg_partition_x|stddev_partition_x|partition_z_score|
+---+-----+-------+---------+---------------+------------------+-----------------+
| a| 81.0| null| 0| 81.0| 0.0| null|
| d| 81.0| null| 0| 81.0| 0.0| null|
| f| 12.0| null| 1| 12.0| 0.0| null|
| b| 36.2| -1.0| 6| 73.95| 37.75| -1.0|
| g|111.7| 1.0| 6| 73.95| 37.75| 1.0|
| c| 12.0| -1.0| 7| 24.15|12.149999999999999| -1.0|
| e| 36.3| 1.0| 7| 24.15|12.149999999999999| 1.0|
+---+-----+-------+---------+---------------+------------------+-----------------+
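(If you specifically want to keep the pandas z-score, one possible workaround, assuming Spark 3.x and that all the data fits on a single executor, is a grouped-map UDF over a single constant group so the whole column is scored together. A minimal sketch, with add_zscore and df_z as illustrative names:)
import pandas as pd
from pyspark.sql import functions as F
from scipy.stats import zscore
# hypothetical helper: receives the entire (single) group as one pandas DataFrame
def add_zscore(pdf: pd.DataFrame) -> pd.DataFrame:
    pdf = pdf.copy()
    pdf["y"] = zscore(pdf["x"])
    return pdf[["id", "x", "y"]]
df_z = (df
        .withColumn("g", F.lit(1))   # one constant group => the UDF sees all rows at once
        .groupBy("g")
        .applyInPandas(add_zscore, schema="id string, x double, y double"))
df_z.show()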
I have the following sample PySpark dataframe, and after a groupby I want to calculate the mean and the first value of multiple columns. In the real case I have hundreds of columns, so I can't do it individually.
sp = spark.createDataFrame([['a',2,4,'cc','anc'], ['a',4,7,'cd','abc'], ['b',6,0,'as','asd'], ['b', 2, 4, 'ad','acb'],
                            ['c', 4, 4, 'sd','acc']], ['id', 'col1', 'col2','col3', 'col4'])
+---+----+----+----+----+
| id|col1|col2|col3|col4|
+---+----+----+----+----+
| a| 2| 4| cc| anc|
| a| 4| 7| cd| abc|
| b| 6| 0| as| asd|
| b| 2| 4| ad| acb|
| c| 4| 4| sd| acc|
+---+----+----+----+----+
This is what I am trying:
mean_cols = ['col1', 'col2']
first_cols = ['col3', 'col4']
sc.groupby('id').agg(*[ f.mean for col in mean_cols], *[f.first for col in first_cols])
but it's not working. How can I do it like this with PySpark?
The best way for multiple functions on multiple columns is to use the .agg(*expr) format.
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
import numpy as np
#Test data
tst = sqlContext.createDataFrame([(1,2,3,4),(3,4,5,1),(5,6,7,8),(7,8,9,2)],schema=['col1','col2','col3','col4'])
fn_l = [F.min,F.max,F.mean,F.first]
col_l=['col1','col2','col3']
expr = [fn(coln).alias(str(fn.__name__)+'_'+str(coln)) for fn in fn_l for coln in col_l]
tst_r = tst.groupby('col4').agg(*expr)
The result will be
tst_r.show()
+----+--------+--------+--------+--------+--------+--------+---------+---------+---------+----------+----------+----------+
|col4|min_col1|min_col2|min_col3|max_col1|max_col2|max_col3|mean_col1|mean_col2|mean_col3|first_col1|first_col2|first_col3|
+----+--------+--------+--------+--------+--------+--------+---------+---------+---------+----------+----------+----------+
| 5| 5| 6| 7| 7| 8| 9| 6.0| 7.0| 8.0| 5| 6| 7|
| 4| 1| 2| 3| 3| 4| 5| 2.0| 3.0| 4.0| 1| 2| 3|
+----+--------+--------+--------+--------+--------+--------+---------+---------+---------+----------+----------+----------+
For selectively applying functions on columns, you can have multiple expression arrays and concatenate them in aggregation.
fn_l = [F.min,F.max]
fn_2=[F.mean,F.first]
col_l=['col1','col2']
col_2=['col1','col3','col4']
expr1 = [fn(coln).alias(str(fn.__name__)+'_'+str(coln)) for fn in fn_l for coln in col_l]
expr2 = [fn(coln).alias(str(fn.__name__)+'_'+str(coln)) for fn in fn_2 for coln in col_2]
tst_r = tst.groupby('col4').agg(*(expr1+expr2))
A simpler way to do it:
import pyspark.sql.functions as F
# mean_cols / first_cols are the lists of columns to average / take the first value of
tst_r = (tst.groupby('col4')
            .agg(*[F.mean(col).alias(f"{col}_mean") for col in mean_cols],
                 *[F.first(col).alias(f"{col}_first") for col in first_cols]))
I have a PySpark Dataframe with two columns:
+---+----+
| Id|Rank|
+---+----+
| a| 5|
| b| 7|
| c| 8|
| d| 1|
+---+----+
For each row, I'm looking to replace Id column with "other" if Rank column is larger than 5.
If I use pseudocode to explain:
For row in df:
if row.Rank > 5:
then replace(row.Id, "other")
The result should look like this:
+-----+----+
| Id|Rank|
+-----+----+
| a| 5|
|other| 7|
|other| 8|
| d| 1|
+-----+----+
Any clue how to achieve this? Thanks!!!
To create this Dataframe:
df = spark.createDataFrame([('a', 5), ('b', 7), ('c', 8), ('d', 1)], ['Id', 'Rank'])
You can use when and otherwise like this:
from pyspark.sql.functions import *
df\
    .withColumn('Id_New', when(df.Rank <= 5, df.Id).otherwise('other'))\
    .drop(df.Id)\
    .select(col('Id_New').alias('Id'), col('Rank'))\
    .show()
This gives the following output:
+-----+----+
| Id|Rank|
+-----+----+
| a| 5|
|other| 7|
|other| 8|
| d| 1|
+-----+----+
Starting with @Pushkr's solution, couldn't you just use the following?
from pyspark.sql.functions import *
df.withColumn('Id',when(df.Rank <= 5,df.Id).otherwise('other')).show()
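(For reference, the same condition can also be written as a SQL expression; a minimal sketch:)
from pyspark.sql.functions import expr
df.withColumn('Id', expr("CASE WHEN Rank <= 5 THEN Id ELSE 'other' END")).show()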
I have the following DataFrame with columns: ["id", "timestamp", "x", "y"]:
+---+----------+---+---+
| id| timestamp| x| y|
+---+----------+---+---+
| 0|1443489380|100| 1|
| 0|1443489390|200| 0|
| 0|1443489400|300| 0|
| 1|1443489410|400| 1|
| 1|1443489550|100| 1|
| 2|1443489560|600| 0|
| 2|1443489570|200| 0|
| 2|1443489580|700| 1|
+---+----------+---+---+
I have defined the following Window:
from pyspark.sql import Window
w = Window.partitionBy("id").orderBy("timestamp")
I would like to extract only the first and last row of data in the window w. How can I accomplish this?
If you want the first and last values on the same row, one way is to use pyspark.sql.functions.first():
from pyspark.sql import Window
from pyspark.sql.functions import col, first
w1 = Window.partitionBy("id").orderBy("timestamp")
w2 = Window.partitionBy("id").orderBy(col("timestamp").desc())  # sort desc
df.select(
"id",
*([first(c).over(w1).alias("first_" + c) for c in df.columns if c != "id"] +
[first(c).over(w2).alias("last_" + c) for c in df.columns if c != "id"])
)\
.distinct()\
.show()
#+---+---------------+-------+-------+--------------+------+------+
#| id|first_timestamp|first_x|first_y|last_timestamp|last_x|last_y|
#+---+---------------+-------+-------+--------------+------+------+
#| 0| 1443489380| 100| 1| 1443489400| 300| 0|
#| 1| 1443489410| 400| 1| 1443489550| 100| 1|
#| 2| 1443489560| 600| 0| 1443489580| 700| 1|
#+---+---------------+-------+-------+--------------+------+------+
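(A minimal sketch of another common pattern: a single ordered window with an explicit full-partition frame, so first and last can share the same WindowSpec instead of needing a second descending window. w_full is an illustrative name.)
from pyspark.sql import Window
from pyspark.sql.functions import first, last
w_full = (Window.partitionBy("id").orderBy("timestamp")
          .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
df.select(
    "id",
    *([first(c).over(w_full).alias("first_" + c) for c in df.columns if c != "id"] +
      [last(c).over(w_full).alias("last_" + c) for c in df.columns if c != "id"])
).distinct().show()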
I have a DataFrame similar to this example:
[image: input DataFrame]
and I want to obtain a new dataframe as follows:
[image: desired output DataFrame]
Update 2:
import pyspark.sql.types as typ
import pyspark.sql.functions as fn
import datetime
from pyspark.sql.functions import *
labels=[('name', typ.StringType()),('month', typ.StringType()),('degree',typ.FloatType())]
schema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])
degree_df = spark.read.csv("file:///home/Ahmad/ahmad_tst/TEST.csv", header= False,schema=schema)
table_count_d=degree_df.groupBy("name","month").agg((min("degree")),(max("degree")))
table_count_d.show()
+-----+-----+-----------+-----------+
| name|month|min(degree)|max(degree)|
+-----+-----+-----------+-----------+
|Ahmad| May| 38.0| 38.0|
|Ahmad|April| 40.0| 49.0|
| Emma| May| 45.0| 50.0|
+-----+-----+-----------+-----------+
table_count_c= degree_df.stat.crosstab("name","month").withColumnRenamed('name_month','name')
table_count_c.show()
+-----+-----+---+
| name|April|May|
+-----+-----+---+
|Ahmad| 2| 1|
| Emma| 0| 2|
+-----+-----+---+
table_4c= table_count_c.join(table_count_d, "name" , 'left_outer')
table_4c.show()
+-----+-----+---+-----+-----------+-----------+
| name|April|May|month|min(degree)|max(degree)|
+-----+-----+---+-----+-----------+-----------+
|Ahmad| 2| 1|April| 40.0| 49.0|
|Ahmad| 2| 1| May| 38.0| 38.0|
| Emma| 0| 2| May| 45.0| 50.0|
+-----+-----+---+-----+-----------+-----------+
Update 3:
According to the recommendation "you could get something similar to what you're after by performing a left outer join on table_count_d with itself", the resulting data frame is given below.
I want to obtain a dataframe as follows:
+-----+-----+---+-----+-----------+-----------+-----+-----------+-----------+
| name|April|May|month|min(degree)|max(degree)|month|min(degree)|max(degree)|
+-----+-----+---+-----+-----------+-----------+-----+-----------+-----------+
|Ahmad| 2| 1| May| 38.0| 38.0|April| 40.0| 49.0|
| Emma| 0| 2| May| 45.0| 50.0|April| 00.0| 00.0|
+-----+-----+---+-----+-----------+-----------+-----+-----------+-----------+
Is there a way to do this with PySpark 2.0.1?
Here are two options; the first is slightly more elegant (especially if you have more than two months) but does not yield exactly what you're after; the second does generate it but is more verbose. (It would help if you explicitly described the logic of what you're trying to achieve.)
1. Use left outer join
The idea is as suggested above, with a condition on a unique id column to prevent the same pair from appearing twice.
import pyspark.sql.functions as func
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext.getOrCreate()
sql_sc = SQLContext(sc)
df1 = sql_sc.createDataFrame([("Ahmad", "May", '38.0', '38.0'), ("Ahmad", "April", '40.0', '49.0'), ("Emma", "May", '45.0', '50.0')],
                             ("name", "month", "min(degree)", "max(degree)"))
# add a unique id column
df1 = df1.withColumn('id', func.monotonically_increasing_id())
# self join - rename columns to maintain unique column names
df2 = df1
for c in df2.columns:
    df2 = df2.withColumnRenamed(c, c + '_2')
# use the id column to prevent the same pair from appearing twice
dfx = df1.join(df2, (df1['name'] == df2['name_2']) & (df1['month'] != df2['month_2']) & (df1['id'] < df2['id_2']) , 'left_outer' )
dfx.show()
Which yields:
+-----+-----+-----------+-----------+-----------+------+-------+-------------+-------------+-----------+
| name|month|min(degree)|max(degree)| id|name_2|month_2|min(degree)_2|max(degree)_2| id_2|
+-----+-----+-----------+-----------+-----------+------+-------+-------------+-------------+-----------+
|Ahmad| May| 38.0| 38.0|17179869184| Ahmad| April| 40.0| 49.0|42949672960|
|Ahmad|April| 40.0| 49.0|42949672960| null| null| null| null| null|
| Emma| May| 45.0| 50.0|60129542144| null| null| null| null| null|
+-----+-----+-----------+-----------+-----------+------+-------+-------------+-------------+-----------+
2. Split data per month
df_4 = df1.where(func.col('month') == 'April')
df_5 = df1.where(func.col('month') == 'May')
df_5.join(df_4, df_5['name'] == df_4['name'], 'outer').show()
Yielding:
+-----+-----+-----------+-----------+-----------+-----+-----+-----------+-----------+-----------+
| name|month|min(degree)|max(degree)| id| name|month|min(degree)|max(degree)| id|
+-----+-----+-----------+-----------+-----------+-----+-----+-----------+-----------+-----------+
|Ahmad| May| 38.0| 38.0|17179869184|Ahmad|April| 40.0| 49.0|42949672960|
| Emma| May| 45.0| 50.0|60129542144| null| null| null| null| null|
+-----+-----+-----------+-----------+-----------+-----+-----+-----------+-----------+-----------+
Another option is to pivot on month after the crosstab join:
import pyspark.sql.types as typ
import pyspark.sql.functions as fn
from pyspark.sql.functions import *
from pyspark.sql import DataFrame
labels=[('name', typ.StringType()),('month', typ.StringType()),('degree',typ.FloatType())]
schema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])
degree_df = spark.read.csv("file:///home/Ahmad/ahmad_tst/TEST.csv", header= False,schema=schema)
table_count_d=degree_df.groupBy("name","month").agg((min("degree")),(max("degree")))
table_count_c= degree_df.stat.crosstab("name","month").withColumnRenamed('name_month','name')
table1=table_count_c.join(table_count_d, "name" , 'left_outer')
df1 = table1.groupby('name').pivot('month').agg(fn.first('min(degree)'), fn.first('max(degree)'))
df1.show()
The resulting DF is as follows:
+-----+-----+---+---------------------------------+---------------------------------+-------------------------------+-------------------------------+
| name|April|May|April_first(`min(degree)`, false)|April_first(`max(degree)`, false)|May_first(`min(degree)`, false)|May_first(`max(degree)`, false)|
+-----+-----+---+---------------------------------+---------------------------------+-------------------------------+-------------------------------+
|Ahmad| 2| 1| 40.0| 49.0| 38.0| 38.0|
| Emma| 0| 2| null| null| 45.0| 50.0|
+-----+-----+---+---------------------------------+---------------------------------+-------------------------------+-------------------------------+
After that, you can rename the columns as you like. For example, see the sketch below.
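(A minimal sketch of one option: alias the aggregations inside the pivot so the generated names come out clean in the first place. The resulting column names are assumed and may differ by Spark version; the crosstab counts from table1 can still be joined back if you need them.)
df1 = (table1.groupby('name')
       .pivot('month')
       .agg(fn.first('min(degree)').alias('min_degree'),
            fn.first('max(degree)').alias('max_degree')))
df1.show()
# expected columns (assumed): name, April_min_degree, April_max_degree, May_min_degree, May_max_degree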