What would be a Pyspark equivalent of the SQL statement NOT IN - pyspark

What would be the equivalent code in PySpark?
If I have table A and Table B, and I want to select certain ID from Table A which is not in Table B, I can do the following SQL command:
Select ID
from Table A
where ID not in (Select ID from Table B)
What would be the equivalent code in PySpark?

You could do a "left anti-join" with the option "left_anti":
A_df.show()
# +-----+---+
# | type| id|
# +-----+---+
# |type1| 10|
# |type2| 20|
# +-----+---+
B_df.show()
# +---+-----+----+
# | id| name|type|
# +---+-----+----+
# | 1|name1| 10|
# | 2|name2| 30|
# | 3|name3| 20|
# +---+-----+----+
B_df.join(A_df, B_df.type == A_df.id, "anti").show()
# +---+-----+----+
# | id| name|type|
# +---+-----+----+
# | 2|name2| 30|
# +---+-----+----+
This would be equivalent to select * from B_df where type not in (select id from A_df)
In an SQL context (see spark sql anti-join):
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)
# register dataframe as tables in the SQL context
sqlc.registerDataFrameAsTable(A_df, "A_table")
sqlc.registerDataFrameAsTable(B_df, "B_table")
spark.sql("SELECT * FROM B_table LEFT ANTI JOIN A_table ON B_table.type == A_table.id").show()
# +---+-----+----+
# | id| name|type|
# +---+-----+----+
# | 2|name2| 30|
# +---+-----+----+
Here's how I created the dataframes:
A = [("type1",10), \
("type2",20), \
]
AColumns = ["type","id"]
A_df = spark.createDataFrame(data=A, schema = AColumns)
A_df.printSchema()
A_df.show(truncate=False)
B = [(1,"name1",10), \
(2,"name2",30), \
(3,"name3",20) \
]
BColumns = ["id","name","type"]
B_df = spark.createDataFrame(data=B, schema = BColumns)
B_df.printSchema()
B_df.show(truncate=False)

Related

filter record in dataframe base on list of value

I have below scenario.
li = ['g1','g2','g3']
df1 = id name goal
1 raj g1
2 harsh g3/g1
3 ramu g1
Above as you can see dataframe df1 and list li
i wanted to filter record in df1 base on list values of li but you can see in goal column first we need to split value base of / del but getting error
df1 = df1.filter(~df1.goal.isin(li))
but this is not returning any record...
is there any way to get record
Using this exemple:
from pyspark.sql import functions as F
from pyspark.sql.types import *
li = ['g1','g2','g3']
df1 = spark.createDataFrame(
[
('1','raj','g1'),
('2','harsh','g3/g1'),
('3','ramu','g1'),
('4','luiz','g2/g4')
],
["id", "name", "goal"]
)
df1.show()
# +---+-----+-----+
# | id| name| goal|
# +---+-----+-----+
# | 1| raj| g1|
# | 2|harsh|g3/g1|
# | 3| ramu| g1|
# | 4| luiz|g2/g4|
# +---+-----+-----+
You can use split to split the goal column and then array_except to find which records are not in your list:
result = df1\
.withColumn('goal_split', F.split(F.col('goal'), "/"))\
.withColumn('li', F.array([F.lit(x) for x in li]))\
.withColumn("test",F.array_except('goal_split','li'))\
.filter(F.col('test') == F.array([]))\
result.show()
# +---+-----+-----+----------+------------+----+
# | id| name| goal|goal_split| li|test|
# +---+-----+-----+----------+------------+----+
# | 1| raj| g1| [g1]|[g1, g2, g3]| []|
# | 2|harsh|g3/g1| [g3, g1]|[g1, g2, g3]| []|
# | 3| ramu| g1| [g1]|[g1, g2, g3]| []|
# +---+-----+-----+----------+------------+----+
Than, select the columns you want for the result:
result.select('id', 'name', 'goal').show().
# +---+-----+-----+
# | id| name| goal|
# +---+-----+-----+
# | 1| raj| g1|
# | 2|harsh|g3/g1|
# | 3| ramu| g1|
# +---+-----+-----+

pypsark convert for loop to map

I have a dataset that has null values
+----+----+----+
|col1|col2|col3|
+----+----+----+
| 1| 0|null|
| 1|null| 0|
|null| 1| 0|
| 1| 0| 0|
| 1| 0| 0|
|null| 0| 1|
| 1| 1| 0|
| 1| 1|null|
|null| 1| 0|
+----+----+----+
I wrote a function to count the percentage of null values of each column in the dataset and removing those columns from the dataset. Below is the function
import pyspark.sql.functions as F
def calc_null_percent(df, strength=None):
if strength is None:
strength = 80
total_count = df.count()
null_cols = []
df2 = df.select([F.count(F.when(F.col(c).contains('None') | \
F.col(c).contains('NULL') | \
(F.col(c) == '' ) | \
F.col(c).isNull() | \
F.isnan(c), c
)).alias(c)
for c in df.columns])
for i in df2.columns:
get_null_val = df2.first()[i]
if (get_null_val/total_count)*100 > strength:
null_cols.append(i)
df = df.drop(*null_cols)
return df
I am using a for loop to get the columns based on the condition. Can we use map or Is there any other way to optimise the for loop in pyspark?
Here's a way to do it with list comprehension.
data_ls = [
(1, 0, 'blah'),
(0, None, 'None'),
(None, 1, 'NULL'),
(1, None, None)
]
data_sdf = spark.sparkContext.parallelize(data_ls).toDF(['id1', 'id2', 'id3'])
# +----+----+----+
# | id1| id2| id3|
# +----+----+----+
# | 1| 0|blah|
# | 0|null|None|
# |null| 1|NULL|
# | 1|null|null|
# +----+----+----+
Now, calculate the percentage of nulls in a dataframe and collect() it for further use.
# total row count
tot_count = data_sdf.count()
# percentage of null records per column
data_null_perc_sdf = data_sdf. \
select(*[(func.sum((func.col(k).isNull() | (func.upper(k).isin(['NONE', 'NULL']))).cast('int')) / tot_count).alias(k+'_nulls_perc') for k in data_sdf.columns])
# +--------------+--------------+--------------+
# |id1_nulls_perc|id2_nulls_perc|id3_nulls_perc|
# +--------------+--------------+--------------+
# | 0.25| 0.5| 0.75|
# +--------------+--------------+--------------+
# collection of the dataframe for list comprehension
data_null_perc = data_null_perc_sdf.collect()
# [Row(id1_nulls_perc=0.25, id2_nulls_perc=0.5, id3_nulls_perc=0.75)]
threshold = 0.5
# retain columns of `data_sdf` that have more null records than aforementioned threshold
cols2drop = [k for k in data_sdf.columns if data_null_perc[0][k+'_nulls_perc'] >= threshold]
# ['id2', 'id3']
Use cols2drop variable to drop the columns from data_sdf in the next step
new_data_sdf = data_sdf.drop(*cols2drop)
# +----+
# | id1|
# +----+
# | 1|
# | 0|
# |null|
# | 1|
# +----+

Merge many dataframes into one in Pyspark [non pandas df]

I will be getting dataframes generated one by one through a process. I have to merge them into one.
+--------+----------+
| Name|Age |
+--------+----------+
|Alex | 30|
+--------+----------+
+--------+----------+
| Name|Age |
+--------+----------+
|Earl | 32|
+--------+----------+
+--------+----------+
| Name|Age |
+--------+----------+
|Jane | 15|
+--------+----------+
Finally:
+--------+----------+
| Name|Age |
+--------+----------+
|Alex | 30|
+--------+----------+
|Earl | 32|
+--------+----------+
|Jane | 15|
+--------+----------+
Tried many options like concat, merge, append but all are I guess pandas libraries. I am not using pandas. Using version python 2.7 and Spark 2.2
Edited to cover final scenario with foreachpartition:
l = [('Alex', 30)]
k = [('Earl', 32)]
ldf = spark.createDataFrame(l, ('Name', 'Age'))
ldf = spark.createDataFrame(k, ('Name', 'Age'))
# option 1:
union_df(ldf).show()
#option 2:
uxdf = union_df(ldf)
uxdf.show()
output in both cases:
+-------+---+
| Name|Age|
+-------+---+
|Earl | 32|
+-------+---+
You can use unionAll() for dataframes:
from functools import reduce # For Python 3.x
from pyspark.sql import DataFrame
def unionAll(*dfs):
return reduce(DataFrame.union, dfs)
df1 = sqlContext.createDataFrame([(1, "foo1"), (2, "bar1")], ("k", "v"))
df2 = sqlContext.createDataFrame([(3, "foo2"), (4, "bar2")], ("k", "v"))
df3 = sqlContext.createDataFrame([(5, "foo3"), (6, "bar3")], ("k", "v"))
unionAll(df1, df2, df3).show()
## +---+----+
## | k| v|
## +---+----+
## | 1|foo1|
## | 2|bar1|
## | 3|foo2|
## | 4|bar2|
## | 5|foo3|
## | 6|bar3|
## +---+----+
EDIT:
You can create an empty dataframe, and keep doing a union to it:
# Create first dataframe
ldf = spark.createDataFrame(l, ["Name", "Age"])
ldf.show()
# Save it's schema
schema = ldf.schema
# Create an empty DF with the same schema, (you need to provide schema to create empty dataframe)
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
empty_df.show()
# Union the first DF with the empty df
empty_df = empty_df.union(ldf)
empty_df.show()
# New dataframe after some operations
ldf = spark.createDataFrame(k, schema)
# Union with the empty_df again
empty_df = empty_df.union(ldf)
empty_df.show()
# First DF ldf
+----+---+
|Name|Age|
+----+---+
|Alex| 30|
+----+---+
# Empty dataframe empty_df
+----+---+
|Name|Age|
+----+---+
+----+---+
# After first union empty_df.union(ldf)
+----+---+
|Name|Age|
+----+---+
|Alex| 30|
+----+---+
# After second union with new ldf
+----+---+
|Name|Age|
+----+---+
|Alex| 30|
|Earl| 32|
+----+---+

HI,Could you please help me resolving Issue while creating new column in Pyspark: I explained the issue as below:

query I'm using:
I want to replace existing columns with new values on condition, if value of another col = ABC then column remain same otherwise should give null or blank.
It's giving result as per logic but only for last column it encounters in loop.
import pyspark.sql.functions as F
for i in df.columns:
if i[4:]!='ff':
new_df=df.withColumn(i,F.when(df.col_ff=="abc",df[i])\
.otherwise(None))
df:
+------+----+-----+-------+
| col1 |col2|col3 | col_ff|
+------+----+-----+-------+
| a | a | d | abc |
| a | b | c | def |
| b | c | b | abc |
| c | d | a | def |
+------+----+-----+-------+
required output:
+------+----+-----+-------+
| col1 |col2|col3 | col_ff|
+------+----+-----+-------+
| a | a | d | abc |
| null |null|null | def |
| b | c | b | abc |
| null |null|null | def |
+------+----+-----+-------+
The problem in your code is that you're overwriting new_df with the original DataFrame df in each iteration of the loop. You can fix it by first setting new_df = df outside of the loop, and then performing the withColumn operations on new_df inside the loop.
For example, if df were the following:
df.show()
#+----+----+----+------+
#|col1|col2|col3|col_ff|
#+----+----+----+------+
#| a| a| d| abc|
#| a| b| c| def|
#| b| c| b| abc|
#| c| d| a| def|
#+----+----+----+------+
Change your code to:
import pyspark.sql.functions as F
new_df = df
for i in df.columns:
if i[4:]!='ff':
new_df = new_df.withColumn(i, F.when(F.col("col_ff")=="abc", F.col(i)))
Notice here that I removed the .otherwise(None) part because when will return null by default if the condition is not met.
You could also do the same using functools.reduce:
from functools import reduce # for python3
new_df = reduce(
lambda df, i: df.withColumn(i, F.when(F.col("col_ff")=="abc", F.col(i))),
[i for i in df.columns if i[4:] != "ff"],
df
)
In both cases the result is the same:
new_df.show()
#+----+----+----+------+
#|col1|col2|col3|col_ff|
#+----+----+----+------+
#| a| a| d| abc|
#|null|null|null| def|
#| b| c| b| abc|
#|null|null|null| def|
#+----+----+----+------+

Text file comparison using Spark data frames

I would like to implement below requirement using Spark dataframes to compare 2 text/csv
List item
files. Ideally, File1.txt should compare with File2.txt and result should be in other txt file with flag as (SAME/UPDATE/INSERT/DELETE).
UPDATE - if any record values are updated in file2 when compared to file1
INSERT - if a new record exist in file2
DELETE - only if the record exist in file1 (not in file2)
SAME - if same record exist in both files
File1.txt
NO DEPT NAME SAL
1 IT RAM 1000
2 IT SRI 600
3 HR GOPI 1500
5 HW MAHI 700
File2.txt
NO DEPT NAME SAL
1 IT RAM 1000
2 IT SRI 900
4 MT SUMP 1200
5 HW MAHI 700
Outputfile.txt
NO DEPT NAME SAL FLAG
1 IT RAM 1000 S
2 IT SRI 900 U
4 MT SUMP 1200 I
5 HW MAHI 700 S
3 HR GOPI 1500 D
So far, i did below coding. But not able to proceed further. Pls help.
from pyspark.shell import spark
sc = spark.sparkContext
df1 = spark.read.option("header","true").option("delimiter", ",").csv("C:\\inputs\\file1.csv")
df2 = spark.read.option("header","true").option("delimiter", ",").csv("C:\\inputs\\file2.csv")
df1.createOrReplaceTempView("table1")
df2.createOrReplaceTempView("table2")
sqlDF1 = spark.sql( "select * from table1" )
sqlDF2 = spark.sql( "select * from table2" )
leftJoinDF = sqlDF1.join(sqlDF2, 'id', how='left')
rightJoinDF = sqlDF1.join(sqlDF2, 'id', how='right')
innerJoinDF = sqlDF1.join(sqlDF2, 'id')
Is there any way if we merge the data, after performing leftJoin, rightJoin, innerJoin. With this whether i could get desired output or any other way.
Thanks,
You can find my solution below. I create 4 dataframe for SAME/UPDATE/INSERT/DELETE cases and then union them
>>> from functools import reduce
>>> from pyspark.sql import DataFrame
>>> import pyspark.sql.functions as F
>>> df1 = sc.parallelize([
... (1,'IT','RAM',1000),
... (2,'IT','SRI',600),
... (3,'HR','GOPI',1500),
... (5,'HW','MAHI',700)
... ]).toDF(['NO','DEPT','NAME','SAL'])
>>> df1.show()
+---+----+----+----+
| NO|DEPT|NAME| SAL|
+---+----+----+----+
| 1| IT| RAM|1000|
| 2| IT| SRI| 600|
| 3| HR|GOPI|1500|
| 5| HW|MAHI| 700|
+---+----+----+----+
>>> df2 = sc.parallelize([
... (1,'IT','RAM',1000),
... (2,'IT','SRI',900),
... (4,'MT','SUMP',1200),
... (5,'HW','MAHI',700)
... ]).toDF(['NO','DEPT','NAME','SAL'])
>>> df2.show()
+---+----+----+----+
| NO|DEPT|NAME| SAL|
+---+----+----+----+
| 1| IT| RAM|1000|
| 2| IT| SRI| 900|
| 4| MT|SUMP|1200|
| 5| HW|MAHI| 700|
+---+----+----+----+
#DELETE
>>> df_d = df1.join(df2, df1.NO == df2.NO, 'left').filter(F.isnull(df2.NO)).select(df1.NO,df1.DEPT,df1.NAME,df1.SAL, F.lit('D').alias('FLAG'))
#INSERT
>>> df_i = df1.join(df2, df1.NO == df2.NO, 'right').filter(F.isnull(df1.NO)).select(df2.NO,df2.DEPT,df2.NAME,df2.SAL, F.lit('I').alias('FLAG'))
#SAME/
>>> df_s = df1.join(df2, df1.NO == df2.NO, 'inner').filter(F.concat(df2.NO,df2.DEPT,df2.NAME,df2.SAL) == F.concat(df1.NO,df1.DEPT,df1.NAME,df1.SAL)).\
... select(df1.NO,df1.DEPT,df1.NAME,df1.SAL, F.lit('S').alias('FLAG'))
#UPDATE
>>> df_u = df1.join(df2, df1.NO == df2.NO, 'inner').filter(F.concat(df2.NO,df2.DEPT,df2.NAME,df2.SAL) != F.concat(df1.NO,df1.DEPT,df1.NAME,df1.SAL)).\
... select(df2.NO,df2.DEPT,df2.NAME,df2.SAL, F.lit('U').alias('FLAG'))
>>> dfs = [df_s,df_u,df_u,df_i]
>>> df = reduce(DataFrame.unionAll, dfs)
>>>
>>> df.show()
+---+----+----+----+----+
| NO|DEPT|NAME| SAL|FLAG|
+---+----+----+----+----+
| 5| HW|MAHI| 700| S|
| 1| IT| RAM|1000| S|
| 2| IT| SRI| 900| U|
| 2| IT| SRI| 900| U|
| 4| MT|SUMP|1200| I|
+---+----+----+----+----+
You can use 'outer' join after concatenating all columns first. Then create an udf for flags.
import pyspark.sql.functions as F
df = sql.createDataFrame([
(1,'IT','RAM',1000),
(2,'IT','SRI',600),
(3,'HR','GOPI',1500),
(5,'HW','MAHI',700)],
['NO' ,'DEPT', 'NAME', 'SAL' ])
df1 = sql.createDataFrame([
(1,'IT','RAM',1000),
(2,'IT','SRI',900),
(4,'MT','SUMP',1200 ),
(5,'HW','MAHI',700)],
['NO' ,'DEPT', 'NAME', 'SAL' ])
def flags(x,y):
if not x:
return y+'-I'
if not y:
return x+'-D'
if x == y:
return x+'-S'
return y+'-U'
_cols = df.columns
flag_udf = F.udf(lambda x,y: flags(x,y),StringType())
df = df.select(['NO']+ [F.concat_ws('-', *[F.col(_c) for _c in df.columns]).alias('f1')])\
.join(df1.select(['NO']+ [F.concat_ws('-', *[F.col(_c1) for _c1 in df1.columns]).alias('f2')]), 'NO', 'outer')\
.select(flag_udf('f1','f2').alias('combined'))
df.show()
The result will be,
+----------------+
| combined|
+----------------+
| 5-HW-MAHI-700-S|
| 1-IT-RAM-1000-S|
|3-HR-GOPI-1500-D|
| 2-IT-SRI-900-U|
|4-MT-SUMP-1200-I|
+----------------+
Finally, split the combined column.
split_col = F.split(df['combined'], '-')
df = df.select([split_col.getItem(i).alias(s) for i,s in enumerate(_cols+['FLAG'])])
df.show()
You get the desired output,
+---+----+----+----+----+
| NO|DEPT|NAME| SAL|FLAG|
+---+----+----+----+----+
| 5| HW|MAHI| 700| S|
| 1| IT| RAM|1000| S|
| 3| HR|GOPI|1500| D|
| 2| IT| SRI| 900| U|
| 4| MT|SUMP|1200| I|
+---+----+----+----+----+