Pivot multiple columns in PySpark

Hello, I am trying to pivot a data table similar to the one below, putting the trouble code values and trouble code statuses into columns and grouping by job #.
Source Table
Desired output
I've tried following the example below, using this code:
How to pivot on multiple columns in Spark SQL?
trouble_df = mydf.withColumn('combcol', F.concat(F.lit('trouble_code_'), mydf['trouble_code'])) \
    .groupby('Job #').pivot('combcol').agg(F.first('trouble_status'))
Below is the output from the code, which isn't exactly what I'm looking for. I'm fairly new to PySpark, so I'm still learning.
Thank you for the help!
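For reference, a minimal sketch of what the source table might look like, using the column names from the attempted code above (the actual values are assumed, reconstructed from the desired output):
from pyspark.sql import functions as F

# hypothetical sample data; column names taken from the code above, values assumed
mydf = spark.createDataFrame(
    [('xxx', 'aa', 'open'), ('xxx', 'bb', 'open'), ('xxx', 'cc', 'closed'),
     ('yyy', 'aa', 'closed'), ('yyy', 'bb', 'open'), ('yyy', 'cc', 'open')],
    ['Job #', 'trouble_code', 'trouble_status']
)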

from pyspark.sql import functions as F
from pyspark.sql import Window

df1 = (
    # Collapse the non-key columns into rows of (column name, value) pairs
    df.withColumn('tab', F.array(*[F.struct(F.lit(x).alias('y'), F.col(x).alias('z'))
                                   for x in df.columns if x != 'job']))
      .selectExpr('*', 'inline(tab)').drop('tab')
    # Create new column names for the pivot (e.g. trouble_code_1, trouble_status_1, ...)
    .withColumn('y', F.concat_ws('_', F.col('y'),
                                 F.dense_rank().over(Window.partitionBy('job')
                                                     .orderBy('job', 'trouble_code')).cast('string')))
    # Pivot
    .groupby('job').pivot('y').agg(F.first('z'))
)
+---+--------------+--------------+--------------+----------------+----------------+----------------+
|job|trouble_code_1|trouble_code_2|trouble_code_3|trouble_status_1|trouble_status_2|trouble_status_3|
+---+--------------+--------------+--------------+----------------+----------------+----------------+
|xxx|            aa|            bb|            cc|            open|            open|          closed|
|yyy|            aa|            bb|            cc|          closed|            open|            open|
+---+--------------+--------------+--------------+----------------+----------------+----------------+

You can create a column with row numbers (i.e. 1, 2, 3 within each job). Use this column as the pivot column with 2 aggregations - one for trouble_code and one for trouble_status.
from pyspark.sql import functions as func
from pyspark.sql.window import Window as wd

pivot_data_sdf = data_sdf. \
    withColumn('rn',
               func.row_number().over(wd.partitionBy('job').orderBy(func.lit(1)))
               ). \
    groupBy('job'). \
    pivot('rn'). \
    agg(func.first('trouble_code').alias('trouble_code'),
        func.first('trouble_status').alias('trouble_status')
        )
# +----+--------------+----------------+--------------+----------------+--------------+----------------+
# | job|1_trouble_code|1_trouble_status|2_trouble_code|2_trouble_status|3_trouble_code|3_trouble_status|
# +----+--------------+----------------+--------------+----------------+--------------+----------------+
# |yyyy| aa| close| bb| open| cc| open|
# |xxxx| aa| open| bb| open| cc| close|
# +----+--------------+----------------+--------------+----------------+--------------+----------------+
Just rename the trouble_* columns.
# takes a pivoted column name like '1_trouble_code' and renames it to 'trouble_code_1'
def col_rename(sdfcolname):
    colsplit = sdfcolname.split('_')
    rearr_colsplit = colsplit[1:3] + [colsplit[0]]
    new_sdfcolname = '_'.join(rearr_colsplit)
    return new_sdfcolname
pivot_data_sdf. \
    select(*[func.col(k).alias(col_rename(k)) if 'trouble' in k else k for k in pivot_data_sdf.columns]). \
    show()
# +----+--------------+----------------+--------------+----------------+--------------+----------------+
# | job|trouble_code_1|trouble_status_1|trouble_code_2|trouble_status_2|trouble_code_3|trouble_status_3|
# +----+--------------+----------------+--------------+----------------+--------------+----------------+
# |yyyy| aa| close| bb| open| cc| open|
# |xxxx| aa| open| bb| open| cc| close|
# +----+--------------+----------------+--------------+----------------+--------------+----------------+
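For completeness, a minimal sketch of the input data_sdf assumed by the snippets above (job ids and values reconstructed from the output shown):
# hypothetical input matching the output shown above
data_sdf = spark.createDataFrame(
    [('xxxx', 'aa', 'open'), ('xxxx', 'bb', 'open'), ('xxxx', 'cc', 'close'),
     ('yyyy', 'aa', 'close'), ('yyyy', 'bb', 'open'), ('yyyy', 'cc', 'open')],
    ['job', 'trouble_code', 'trouble_status']
)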

Related

Filter records in a dataframe based on a list of values

I have the scenario below.
li = ['g1','g2','g3']
df1:
id  name   goal
1   raj    g1
2   harsh  g3/g1
3   ramu   g1
Above you can see the dataframe df1 and the list li.
I want to filter the records in df1 based on the values in li, but as you can see, the values in the goal column first need to be split on the / delimiter. I tried:
df1 = df1.filter(~df1.goal.isin(li))
but this is not returning any records...
Is there any way to get the records?
Using this example:
from pyspark.sql import functions as F
from pyspark.sql.types import *
li = ['g1','g2','g3']
df1 = spark.createDataFrame(
    [
        ('1','raj','g1'),
        ('2','harsh','g3/g1'),
        ('3','ramu','g1'),
        ('4','luiz','g2/g4')
    ],
    ["id", "name", "goal"]
)
df1.show()
# +---+-----+-----+
# | id| name| goal|
# +---+-----+-----+
# | 1| raj| g1|
# | 2|harsh|g3/g1|
# | 3| ramu| g1|
# | 4| luiz|g2/g4|
# +---+-----+-----+
You can use split to split the goal column and then array_except to find which goal values are not in your list:
result = df1\
    .withColumn('goal_split', F.split(F.col('goal'), "/"))\
    .withColumn('li', F.array([F.lit(x) for x in li]))\
    .withColumn("test", F.array_except('goal_split', 'li'))\
    .filter(F.col('test') == F.array([]))

result.show()
# +---+-----+-----+----------+------------+----+
# | id| name| goal|goal_split| li|test|
# +---+-----+-----+----------+------------+----+
# | 1| raj| g1| [g1]|[g1, g2, g3]| []|
# | 2|harsh|g3/g1| [g3, g1]|[g1, g2, g3]| []|
# | 3| ramu| g1| [g1]|[g1, g2, g3]| []|
# +---+-----+-----+----------+------------+----+
Then, select the columns you want for the result:
result.select('id', 'name', 'goal').show()
# +---+-----+-----+
# | id| name| goal|
# +---+-----+-----+
# | 1| raj| g1|
# | 2|harsh|g3/g1|
# | 3| ramu| g1|
# +---+-----+-----+
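As a side note, the same filter can be written in one step without the helper columns; a sketch, assuming the same df1 and li as above:
from pyspark.sql import functions as F

# keep rows whose goal values (after splitting on "/") are all contained in li
df1.filter(
    F.size(F.array_except(F.split('goal', '/'), F.array(*[F.lit(x) for x in li]))) == 0
).show()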

Splitting annual data into quarters in pyspark df

I have a df that looks like the one below. I need to break the annual data down into quarters, so for each company and each year I need to create new rows with the quarter-end date and the new EV (simply the annual value divided by 4). Any suggestions on how to do it?
+----------+------+---+
| date|entity| EV|
+----------+------+---+
|2018-12-31| x| 40|
|2019-12-31| x| 80|
|2018-12-31| y|120|
+----------+------+---+
Expected output:
+----------+------+---+
| date|entity| EV|
+----------+------+---+
|2018-03-31| x| 10|
|2018-06-30| x| 10|
|2018-09-30| x| 10|
|2018-12-31| x| 10|
|2019-03-31| x| 20|
|2019-06-30| x| 20|
|2019-09-30| x| 20|
|2019-12-31| x| 20|
|2018-03-31| y| 30|
|2018-06-30| y| 30|
|2018-09-30| y| 30|
|2018-12-31| y| 30|
+----------+------+---+
Here's one way to do it using arrays and transform.
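Note that the snippet below refers to the input as data_sdf with a dt column, while the question's frame uses date; a minimal setup sketch under that assumption:
from pyspark.sql import functions as func

# hypothetical recreation of the question's data with the column names used below
data_sdf = spark.createDataFrame(
    [('2018-12-31', 'x', 40), ('2019-12-31', 'x', 80), ('2018-12-31', 'y', 120)],
    ['dt', 'entity', 'ev']
).withColumn('dt', func.col('dt').cast('date'))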
data_sdf. \
    withColumn('qtr_dt_suffix',
               func.array(func.lit('03-31'), func.lit('06-30'), func.lit('09-30'), func.lit('12-31'))
               ). \
    withColumn('qtr_dts',
               func.transform('qtr_dt_suffix', lambda x: func.concat(func.year('dt'), func.lit('-'), x).cast('date'))
               ). \
    select(func.explode('qtr_dts').alias('qtr_dt'), 'entity', (func.col('ev') / 4).alias('ev')). \
    show()
# +----------+------+----+
# |qtr_dt |entity|ev |
# +----------+------+----+
# |2018-03-31|x |10.0|
# |2018-06-30|x |10.0|
# |2018-09-30|x |10.0|
# |2018-12-31|x |10.0|
# |2019-03-31|x |20.0|
# |2019-06-30|x |20.0|
# |2019-09-30|x |20.0|
# |2019-12-31|x |20.0|
# |2018-03-31|y |30.0|
# |2018-06-30|y |30.0|
# |2018-09-30|y |30.0|
# |2018-12-31|y |30.0|
# +----------+------+----+
The idea is to create an array containing the month-day suffixes of all the quarter-end dates - [03-31, 06-30, 09-30, 12-31]. Use transform on this array to create the dates for that year - [2018-03-31, 2018-06-30, 2018-09-30, 2018-12-31]. Explode the resulting array to create a row for each quarter date.
If the transform function is not available in your Spark version (the Python API for it was added in 3.1), you can use the transform SQL function inside expr.
data_sdf. \
    withColumn('qtr_dt_suffix',
               func.array(func.lit('03-31'), func.lit('06-30'), func.lit('09-30'), func.lit('12-31'))
               ). \
    withColumn('qtr_dts',
               func.expr('transform(qtr_dt_suffix, x -> cast(concat(year(dt), "-", x) as date))')
               ). \
    show(truncate=False)

How to find the intersection and symmetric difference of the same ID in two dataframes using pyspark

How to find the intersection and symmetric difference of the same ID in two dataframes using pyspark.
(The example dataframes were provided as an image; see the input dataframes in the answer below.)
thanks~
Try using semi and anti joins. See the code implementation below for details.
Input dataframes:
df1 = spark.createDataFrame(data=[(100,1,115),(200,2,286),(300,3,72),(400,4,819)], schema= ["userId", "movieId", "score"])
df2 = spark.createDataFrame(data=[(100,1,115),(200,2,286),(500,3,72),(600,4,819)], schema= ["userId", "movieId", "score"])
Resultant dataframe result1 (the intersection):
result1 = df1.join(df2, df1["userId"] == df2["userId"], how ="semi")
result1.show()
+------+-------+-----+
|userId|movieId|score|
+------+-------+-----+
| 100| 1| 115|
| 200| 2| 286|
+------+-------+-----+
Resultant dataframe result2 (the symmetric difference):
result21 = df1.join(df2, df1["userId"] == df2["userId"], how ="anti")
result22 = df2.join(df1, df2["userId"] == df1["userId"], how ="anti")
result2 = result21.unionAll(result22)
result2.show()
+------+-------+-----+
|userId|movieId|score|
+------+-------+-----+
| 300| 3| 72|
| 400| 4| 819|
| 500| 3| 72|
| 600| 4| 819|
+------+-------+-----+
You can go through the official Spark documentation below for more details on both join types:
Spark Semi Join
Spark Anti Join
Which join to use depends on the columns you want to join on. If you are interested in the intersection and symmetric difference of entire rows, subtract and intersect are safer because they take all the columns of both DataFrames into consideration:
df1.intersect(df2).show()
# +------+-------+-----+
# |userId|movieId|score|
# +------+-------+-----+
# | 100| 1| 115|
# | 200| 2| 286|
# +------+-------+-----+
df1.subtract(df2).union(df2.subtract(df1)).sort("userId").show()
# +------+-------+-----+
# |userId|movieId|score|
# +------+-------+-----+
# | 300| 3| 72|
# | 400| 4| 819|
# | 500| 3| 72|
# | 600| 4| 819|
# +------+-------+-----+
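Note that subtract and intersect behave like SQL EXCEPT DISTINCT and INTERSECT DISTINCT, i.e. they drop duplicate rows. If duplicates matter, the multiset variants available since Spark 2.4 can be used instead; a sketch:
# duplicate-preserving variants of the same idea
df1.intersectAll(df2).show()
df1.exceptAll(df2).union(df2.exceptAll(df1)).sort("userId").show()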

What would be a Pyspark equivalent of the SQL statement NOT IN

If I have Table A and Table B, and I want to select certain IDs from Table A which are not in Table B, I can run the following SQL command:
Select ID
from Table A
where ID not in (Select ID from Table B)
What would be the equivalent code in PySpark?
You could do a left anti-join, using the join type "anti" (also accepted as "left_anti"):
A_df.show()
# +-----+---+
# | type| id|
# +-----+---+
# |type1| 10|
# |type2| 20|
# +-----+---+
B_df.show()
# +---+-----+----+
# | id| name|type|
# +---+-----+----+
# | 1|name1| 10|
# | 2|name2| 30|
# | 3|name3| 20|
# +---+-----+----+
B_df.join(A_df, B_df.type == A_df.id, "anti").show()
# +---+-----+----+
# | id| name|type|
# +---+-----+----+
# | 2|name2| 30|
# +---+-----+----+
This would be equivalent to select * from B_df where type not in (select id from A_df)
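For completeness, a literal NOT IN can also be expressed with isin when the lookup side is small enough to collect to the driver; a sketch, reusing the same A_df and B_df:
from pyspark.sql import functions as F

# collect the ids from A_df and keep the B_df rows whose type is not among them
a_ids = [row.id for row in A_df.select('id').distinct().collect()]
B_df.filter(~F.col('type').isin(a_ids)).show()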
In an SQL context (see spark sql anti-join):
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)
# register dataframe as tables in the SQL context
sqlc.registerDataFrameAsTable(A_df, "A_table")
sqlc.registerDataFrameAsTable(B_df, "B_table")
spark.sql("SELECT * FROM B_table LEFT ANTI JOIN A_table ON B_table.type == A_table.id").show()
# +---+-----+----+
# | id| name|type|
# +---+-----+----+
# | 2|name2| 30|
# +---+-----+----+
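On recent Spark versions, SQLContext and registerDataFrameAsTable are deprecated; the same query can be run through temporary views on the SparkSession (a sketch, assuming an existing session named spark):
# register the dataframes as temporary views and run the same anti join in SQL
A_df.createOrReplaceTempView("A_table")
B_df.createOrReplaceTempView("B_table")
spark.sql("SELECT * FROM B_table LEFT ANTI JOIN A_table ON B_table.type = A_table.id").show()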
Here's how I created the dataframes:
A = [("type1",10), \
("type2",20), \
]
AColumns = ["type","id"]
A_df = spark.createDataFrame(data=A, schema = AColumns)
A_df.printSchema()
A_df.show(truncate=False)
B = [(1,"name1",10), \
(2,"name2",30), \
(3,"name3",20) \
]
BColumns = ["id","name","type"]
B_df = spark.createDataFrame(data=B, schema = BColumns)
B_df.printSchema()
B_df.show(truncate=False)

Unsure how to apply row-wise normalization on pyspark dataframe

Disclaimer: I'm a beginner when it comes to Pyspark.
For each cell in a row, I'd like to apply the following function
new_col_i = col_i / max(col_1,col_2,col_3,...,col_n)
At the very end, I'd like the range of values to go from 0.0 to 1.0.
Here are the details of my dataframe:
Dimensions: (6.5M, 2905)
Dtypes: Double
Initial DF:
+-----+-------+-------+-------+
|   id|  col_1|  col_2|  col_n|
+-----+-------+-------+-------+
|    1|    7.5|    0.1|    2.0|
|    2|    0.3|    3.5|   10.5|
+-----+-------+-------+-------+
Updated DF:
+-----+-------+-------+-------+
|   id|  col_1|  col_2|  col_n|
+-----+-------+-------+-------+
|    1|    1.0|  0.013|   0.26|
|    2|  0.028|   0.33|    1.0|
+-----+-------+-------+-------+
Any help would be appreciated.
You can find the row-wise maximum from an array of the columns and then loop over the columns, replacing each one with its normalized value.
from pyspark.sql.functions import array, array_max, col

cols = df.columns[1:]

df2 = df.withColumn('max', array_max(array(*[col(c) for c in cols])))
for c in cols:
    df2 = df2.withColumn(c, col(c) / col('max'))
df2.show()
+---+-------------------+--------------------+-------------------+----+
| id| col_1| col_2| col_n| max|
+---+-------------------+--------------------+-------------------+----+
| 1| 1.0|0.013333333333333334|0.26666666666666666| 7.5|
| 2|0.02857142857142857| 0.3333333333333333| 1.0|10.5|
+---+-------------------+--------------------+-------------------+----+
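As a side note, with roughly 2,900 columns, chaining that many withColumn calls can make the query plan very large; the same normalization can be expressed as a single select, for example with greatest (a sketch, assuming the first column is the id, as in the snippet above):
from pyspark.sql.functions import col, greatest

cols = df.columns[1:]
row_max = greatest(*[col(c) for c in cols])  # row-wise maximum across the value columns
df.select('id', *[(col(c) / row_max).alias(c) for c in cols]).show()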