What would be a Pyspark equivalent of the SQL statement NOT IN

What would be a Pyspark equivalent of the SQL statement NOT IN - pyspark

What would be the equivalent code in PySpark?
If I have table A and Table B, and I want to select certain ID from Table A which is not in Table B, I can do the following SQL command:
Select ID
from Table A
where ID not in (Select ID from Table B)
What would be the equivalent code in PySpark?

You could do a "left anti-join" by passing "left_anti" (or its shorter alias "anti", used below) as the join type:
A_df.show()
# +-----+---+
# | type| id|
# +-----+---+
# |type1| 10|
# |type2| 20|
# +-----+---+
B_df.show()
# +---+-----+----+
# | id| name|type|
# +---+-----+----+
# | 1|name1| 10|
# | 2|name2| 30|
# | 3|name3| 20|
# +---+-----+----+
B_df.join(A_df, B_df.type == A_df.id, "anti").show()
# +---+-----+----+
# | id| name|type|
# +---+-----+----+
# | 2|name2| 30|
# +---+-----+----+
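This would be equivalent to select * from B_df where type not in (select id from A_df), provided neither column contains NULLs (NOT IN returns no rows when the subquery yields a NULL, whereas the anti-join simply ignores it).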
In an SQL context (see spark sql anti-join):
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)
# Register both dataframes as temporary tables in this SQL context so they
# can be referenced by name inside a SQL query.
sqlc.registerDataFrameAsTable(A_df, "A_table")
sqlc.registerDataFrameAsTable(B_df, "B_table")
# Query through the same context the tables were registered in.
sqlc.sql("SELECT * FROM B_table LEFT ANTI JOIN A_table ON B_table.type == A_table.id").show()
# +---+-----+----+
# | id| name|type|
# +---+-----+----+
# | 2|name2| 30|
# +---+-----+----+
Here's how I created the dataframes:
# Rows and column names for dataframe A.
A = [
    ("type1", 10),
    ("type2", 20),
]
AColumns = ["type", "id"]
A_df = spark.createDataFrame(data=A, schema=AColumns)
A_df.printSchema()
A_df.show(truncate=False)

# Rows and column names for dataframe B.
B = [
    (1, "name1", 10),
    (2, "name2", 30),
    (3, "name3", 20),
]
BColumns = ["id", "name", "type"]
B_df = spark.createDataFrame(data=B, schema=BColumns)
B_df.printSchema()
B_df.show(truncate=False)

Related

filter record in dataframe base on list of value

I have below scenario.
li = ['g1','g2','g3']
df1 = id name goal
1 raj g1
2 harsh g3/g1
3 ramu g1
Above as you can see dataframe df1 and list li
I want to filter the records in df1 based on the values in the list li. As you can see, the values in the goal column first need to be split on the / delimiter, but I am getting an error:
df1 = df1.filter(~df1.goal.isin(li))
but this is not returning any record...
is there any way to get record

Using this example:
from pyspark.sql import functions as F
from pyspark.sql.types import *
li = ['g1','g2','g3']
df1 = spark.createDataFrame(
[
('1','raj','g1'),
('2','harsh','g3/g1'),
('3','ramu','g1'),
('4','luiz','g2/g4')
],
["id", "name", "goal"]
)
df1.show()
# +---+-----+-----+
# | id| name| goal|
# +---+-----+-----+
# | 1| raj| g1|
# | 2|harsh|g3/g1|
# | 3| ramu| g1|
# | 4| luiz|g2/g4|
# +---+-----+-----+
You can use split to split the goal column and then array_except to find which records are not in your list:
# Split `goal` on "/", then keep only the rows whose split values are all
# present in `li`: array_except leaves the values that are NOT in `li`, so
# an empty remainder means every goal was found in the list.
result = (
    df1
    .withColumn('goal_split', F.split(F.col('goal'), "/"))
    .withColumn('li', F.array([F.lit(x) for x in li]))
    .withColumn("test", F.array_except('goal_split', 'li'))
    .filter(F.col('test') == F.array([]))
)
result.show()
# +---+-----+-----+----------+------------+----+
# | id| name| goal|goal_split| li|test|
# +---+-----+-----+----------+------------+----+
# | 1| raj| g1| [g1]|[g1, g2, g3]| []|
# | 2|harsh|g3/g1| [g3, g1]|[g1, g2, g3]| []|
# | 3| ramu| g1| [g1]|[g1, g2, g3]| []|
# +---+-----+-----+----------+------------+----+
Then, select the columns you want for the result:
result.select('id', 'name', 'goal').show()
# +---+-----+-----+
# | id| name| goal|
# +---+-----+-----+
# | 1| raj| g1|
# | 2|harsh|g3/g1|
# | 3| ramu| g1|
# +---+-----+-----+

pyspark convert for loop to map

I have a dataset that has null values
+----+----+----+
|col1|col2|col3|
+----+----+----+
| 1| 0|null|
| 1|null| 0|
|null| 1| 0|
| 1| 0| 0|
| 1| 0| 0|
|null| 0| 1|
| 1| 1| 0|
| 1| 1|null|
|null| 1| 0|
+----+----+----+
I wrote a function to count the percentage of null values of each column in the dataset and removing those columns from the dataset. Below is the function
import pyspark.sql.functions as F
def calc_null_percent(df, strength=None):
    """Drop the columns of ``df`` whose share of null-like values exceeds ``strength``.

    A value counts as null-like when it contains 'None' or 'NULL', equals
    the empty string, is null, or is NaN.

    Args:
        df: Spark DataFrame to inspect.
        strength: Threshold as a percentage (0-100). Defaults to 80 when None.

    Returns:
        ``df`` without the columns whose null-like percentage is strictly
        greater than ``strength``. An empty ``df`` is returned unchanged.
    """
    if strength is None:
        strength = 80
    total_count = df.count()
    if total_count == 0:
        # No rows means no percentage can be computed; avoid dividing by zero.
        return df
    # One aggregation over all columns, fetched once, instead of re-running
    # the query with first() for every column.
    null_counts = df.select([F.count(F.when(F.col(c).contains('None') |
                                            F.col(c).contains('NULL') |
                                            (F.col(c) == '') |
                                            F.col(c).isNull() |
                                            F.isnan(c), c
                                            )).alias(c)
                             for c in df.columns]).first()
    null_cols = [c for c in df.columns
                 if (null_counts[c] / total_count) * 100 > strength]
    return df.drop(*null_cols)
I am using a for loop to get the columns based on the condition. Can we use map or Is there any other way to optimise the for loop in pyspark?

Here's a way to do it with list comprehension.
data_ls = [
(1, 0, 'blah'),
(0, None, 'None'),
(None, 1, 'NULL'),
(1, None, None)
]
data_sdf = spark.sparkContext.parallelize(data_ls).toDF(['id1', 'id2', 'id3'])
# +----+----+----+
# | id1| id2| id3|
# +----+----+----+
# | 1| 0|blah|
# | 0|null|None|
# |null| 1|NULL|
# | 1|null|null|
# +----+----+----+
Now, calculate the percentage of nulls in a dataframe and collect() it for further use.
from pyspark.sql import functions as func

# total row count
tot_count = data_sdf.count()
# fraction (0-1) of null-like records per column: real nulls plus the
# strings 'None' / 'NULL' in any letter case
data_null_perc_sdf = data_sdf.select(*[
    (
        func.sum(
            (func.col(k).isNull() | func.upper(k).isin(['NONE', 'NULL'])).cast('int')
        ) / tot_count
    ).alias(k + '_nulls_perc')
    for k in data_sdf.columns
])
# +--------------+--------------+--------------+
# |id1_nulls_perc|id2_nulls_perc|id3_nulls_perc|
# +--------------+--------------+--------------+
# | 0.25| 0.5| 0.75|
# +--------------+--------------+--------------+
# collection of the dataframe for list comprehension
data_null_perc = data_null_perc_sdf.collect()
# [Row(id1_nulls_perc=0.25, id2_nulls_perc=0.5, id3_nulls_perc=0.75)]
threshold = 0.5
# collect the columns of `data_sdf` whose null fraction is greater than or equal to the threshold (these are the columns to drop)
cols2drop = [k for k in data_sdf.columns if data_null_perc[0][k+'_nulls_perc'] >= threshold]
# ['id2', 'id3']
Use cols2drop variable to drop the columns from data_sdf in the next step
new_data_sdf = data_sdf.drop(*cols2drop)
# +----+
# | id1|
# +----+
# | 1|
# | 0|
# |null|
# | 1|
# +----+

Merge many dataframes into one in Pyspark [non pandas df]

I will be getting dataframes generated one by one through a process. I have to merge them into one.
+--------+----------+
| Name|Age |
+--------+----------+
|Alex | 30|
+--------+----------+
+--------+----------+
| Name|Age |
+--------+----------+
|Earl | 32|
+--------+----------+
+--------+----------+
| Name|Age |
+--------+----------+
|Jane | 15|
+--------+----------+
Finally:
+--------+----------+
| Name|Age |
+--------+----------+
|Alex | 30|
+--------+----------+
|Earl | 32|
+--------+----------+
|Jane | 15|
+--------+----------+
Tried many options like concat, merge, append but all are I guess pandas libraries. I am not using pandas. Using version python 2.7 and Spark 2.2
Edited to cover final scenario with foreachpartition:
l = [('Alex', 30)]
k = [('Earl', 32)]
ldf = spark.createDataFrame(l, ('Name', 'Age'))
ldf = spark.createDataFrame(k, ('Name', 'Age'))
# option 1:
union_df(ldf).show()
#option 2:
uxdf = union_df(ldf)
uxdf.show()
output in both cases:
+-------+---+
| Name|Age|
+-------+---+
|Earl | 32|
+-------+---+

You can use unionAll() for dataframes:
from functools import reduce # For Python 3.x
from pyspark.sql import DataFrame
def unionAll(*dfs):
    """Union any number of DataFrames into a single DataFrame.

    Folds ``DataFrame.union`` over ``dfs`` from left to right.

    Args:
        *dfs: The DataFrames to combine; at least one is required.

    Returns:
        The union of all given DataFrames.

    Raises:
        TypeError: If called with no DataFrames (``reduce`` has no initial
            value to fall back on).
    """
    return reduce(DataFrame.union, dfs)
df1 = sqlContext.createDataFrame([(1, "foo1"), (2, "bar1")], ("k", "v"))
df2 = sqlContext.createDataFrame([(3, "foo2"), (4, "bar2")], ("k", "v"))
df3 = sqlContext.createDataFrame([(5, "foo3"), (6, "bar3")], ("k", "v"))
unionAll(df1, df2, df3).show()
## +---+----+
## | k| v|
## +---+----+
## | 1|foo1|
## | 2|bar1|
## | 3|foo2|
## | 4|bar2|
## | 5|foo3|
## | 6|bar3|
## +---+----+
EDIT:
You can create an empty dataframe, and keep doing a union to it:
# Create first dataframe
ldf = spark.createDataFrame(l, ["Name", "Age"])
ldf.show()
# Save its schema
schema = ldf.schema
# Create an empty DF with the same schema, (you need to provide schema to create empty dataframe)
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
empty_df.show()
# Union the first DF with the empty df
empty_df = empty_df.union(ldf)
empty_df.show()
# New dataframe after some operations
ldf = spark.createDataFrame(k, schema)
# Union with the empty_df again
empty_df = empty_df.union(ldf)
empty_df.show()
# First DF ldf
+----+---+
|Name|Age|
+----+---+
|Alex| 30|
+----+---+
# Empty dataframe empty_df
+----+---+
|Name|Age|
+----+---+
+----+---+
# After first union empty_df.union(ldf)
+----+---+
|Name|Age|
+----+---+
|Alex| 30|
+----+---+
# After second union with new ldf
+----+---+
|Name|Age|
+----+---+
|Alex| 30|
|Earl| 32|
+----+---+

HI,Could you please help me resolving Issue while creating new column in Pyspark: I explained the issue as below:

query I'm using:
I want to replace existing columns with new values on condition, if value of another col = ABC then column remain same otherwise should give null or blank.
It's giving result as per logic but only for last column it encounters in loop.
import pyspark.sql.functions as F
for i in df.columns:
if i[4:]!='ff':
new_df=df.withColumn(i,F.when(df.col_ff=="abc",df[i])\
.otherwise(None))
df:
+------+----+-----+-------+
| col1 |col2|col3 | col_ff|
+------+----+-----+-------+
| a | a | d | abc |
| a | b | c | def |
| b | c | b | abc |
| c | d | a | def |
+------+----+-----+-------+
required output:
+------+----+-----+-------+
| col1 |col2|col3 | col_ff|
+------+----+-----+-------+
| a | a | d | abc |
| null |null|null | def |
| b | c | b | abc |
| null |null|null | def |
+------+----+-----+-------+

The problem in your code is that you're overwriting new_df with the original DataFrame df in each iteration of the loop. You can fix it by first setting new_df = df outside of the loop, and then performing the withColumn operations on new_df inside the loop.
For example, if df were the following:
df.show()
#+----+----+----+------+
#|col1|col2|col3|col_ff|
#+----+----+----+------+
#| a| a| d| abc|
#| a| b| c| def|
#| b| c| b| abc|
#| c| d| a| def|
#+----+----+----+------+
Change your code to:
import pyspark.sql.functions as F
# Start from the original dataframe and accumulate each change on new_df,
# so earlier replacements are not lost on the next iteration.
new_df = df
for i in df.columns:
    # Skip the column whose name ends in 'ff' after the 4-character prefix
    # (col_ff itself).
    if i[4:] != 'ff':
        # Keep the value only where col_ff is "abc"; `when` without
        # `otherwise` yields null for all other rows.
        new_df = new_df.withColumn(i, F.when(F.col("col_ff") == "abc", F.col(i)))
Notice here that I removed the .otherwise(None) part because when will return null by default if the condition is not met.
You could also do the same using functools.reduce:
from functools import reduce # for python3
new_df = reduce(
lambda df, i: df.withColumn(i, F.when(F.col("col_ff")=="abc", F.col(i))),
[i for i in df.columns if i[4:] != "ff"],
df
)
In both cases the result is the same:
new_df.show()
#+----+----+----+------+
#|col1|col2|col3|col_ff|
#+----+----+----+------+
#| a| a| d| abc|
#|null|null|null| def|
#| b| c| b| abc|
#|null|null|null| def|
#+----+----+----+------+

Text file comparison using Spark data frames

I would like to implement the requirement below using Spark dataframes to compare 2 text/csv files. Ideally, File1.txt should be compared with File2.txt, and the result should be written to another txt file with a flag (SAME/UPDATE/INSERT/DELETE).
UPDATE - if any record values are updated in file2 when compared to file1
INSERT - if a new record exist in file2
DELETE - only if the record exist in file1 (not in file2)
SAME - if same record exist in both files
File1.txt
NO DEPT NAME SAL
1 IT RAM 1000
2 IT SRI 600
3 HR GOPI 1500
5 HW MAHI 700
File2.txt
NO DEPT NAME SAL
1 IT RAM 1000
2 IT SRI 900
4 MT SUMP 1200
5 HW MAHI 700
Outputfile.txt
NO DEPT NAME SAL FLAG
1 IT RAM 1000 S
2 IT SRI 900 U
4 MT SUMP 1200 I
5 HW MAHI 700 S
3 HR GOPI 1500 D
So far, i did below coding. But not able to proceed further. Pls help.
from pyspark.shell import spark
sc = spark.sparkContext
df1 = spark.read.option("header","true").option("delimiter", ",").csv("C:\\inputs\\file1.csv")
df2 = spark.read.option("header","true").option("delimiter", ",").csv("C:\\inputs\\file2.csv")
df1.createOrReplaceTempView("table1")
df2.createOrReplaceTempView("table2")
sqlDF1 = spark.sql( "select * from table1" )
sqlDF2 = spark.sql( "select * from table2" )
leftJoinDF = sqlDF1.join(sqlDF2, 'id', how='left')
rightJoinDF = sqlDF1.join(sqlDF2, 'id', how='right')
innerJoinDF = sqlDF1.join(sqlDF2, 'id')
Is there any way if we merge the data, after performing leftJoin, rightJoin, innerJoin. With this whether i could get desired output or any other way.
Thanks,

You can find my solution below. I create four DataFrames, one each for the SAME/UPDATE/INSERT/DELETE cases, and then union them:
>>> from functools import reduce
>>> from pyspark.sql import DataFrame
>>> import pyspark.sql.functions as F
>>> df1 = sc.parallelize([
... (1,'IT','RAM',1000),
... (2,'IT','SRI',600),
... (3,'HR','GOPI',1500),
... (5,'HW','MAHI',700)
... ]).toDF(['NO','DEPT','NAME','SAL'])
>>> df1.show()
+---+----+----+----+
| NO|DEPT|NAME| SAL|
+---+----+----+----+
| 1| IT| RAM|1000|
| 2| IT| SRI| 600|
| 3| HR|GOPI|1500|
| 5| HW|MAHI| 700|
+---+----+----+----+
>>> df2 = sc.parallelize([
... (1,'IT','RAM',1000),
... (2,'IT','SRI',900),
... (4,'MT','SUMP',1200),
... (5,'HW','MAHI',700)
... ]).toDF(['NO','DEPT','NAME','SAL'])
>>> df2.show()
+---+----+----+----+
| NO|DEPT|NAME| SAL|
+---+----+----+----+
| 1| IT| RAM|1000|
| 2| IT| SRI| 900|
| 4| MT|SUMP|1200|
| 5| HW|MAHI| 700|
+---+----+----+----+
#DELETE
>>> df_d = df1.join(df2, df1.NO == df2.NO, 'left').filter(F.isnull(df2.NO)).select(df1.NO,df1.DEPT,df1.NAME,df1.SAL, F.lit('D').alias('FLAG'))
#INSERT
>>> df_i = df1.join(df2, df1.NO == df2.NO, 'right').filter(F.isnull(df1.NO)).select(df2.NO,df2.DEPT,df2.NAME,df2.SAL, F.lit('I').alias('FLAG'))
#SAME/
>>> df_s = df1.join(df2, df1.NO == df2.NO, 'inner').filter(F.concat(df2.NO,df2.DEPT,df2.NAME,df2.SAL) == F.concat(df1.NO,df1.DEPT,df1.NAME,df1.SAL)).\
... select(df1.NO,df1.DEPT,df1.NAME,df1.SAL, F.lit('S').alias('FLAG'))
#UPDATE
>>> df_u = df1.join(df2, df1.NO == df2.NO, 'inner').filter(F.concat(df2.NO,df2.DEPT,df2.NAME,df2.SAL) != F.concat(df1.NO,df1.DEPT,df1.NAME,df1.SAL)).\
... select(df2.NO,df2.DEPT,df2.NAME,df2.SAL, F.lit('U').alias('FLAG'))
>>> dfs = [df_s,df_u,df_u,df_i]
>>> df = reduce(DataFrame.unionAll, dfs)
>>>
>>> df.show()
+---+----+----+----+----+
| NO|DEPT|NAME| SAL|FLAG|
+---+----+----+----+----+
| 5| HW|MAHI| 700| S|
| 1| IT| RAM|1000| S|
| 2| IT| SRI| 900| U|
| 2| IT| SRI| 900| U|
| 4| MT|SUMP|1200| I|
+---+----+----+----+----+

You can use an 'outer' join after first concatenating all columns, then create a UDF to compute the flags.
import pyspark.sql.functions as F
df = sql.createDataFrame([
(1,'IT','RAM',1000),
(2,'IT','SRI',600),
(3,'HR','GOPI',1500),
(5,'HW','MAHI',700)],
['NO' ,'DEPT', 'NAME', 'SAL' ])
df1 = sql.createDataFrame([
(1,'IT','RAM',1000),
(2,'IT','SRI',900),
(4,'MT','SUMP',1200 ),
(5,'HW','MAHI',700)],
['NO' ,'DEPT', 'NAME', 'SAL' ])
def flags(x, y):
    """Append a change flag to a pair of concatenated records.

    Args:
        x: Concatenated record from the first dataframe, or a falsy value
            (None / empty string) when the record is absent there.
        y: Concatenated record from the second dataframe, or a falsy value
            when the record is absent there.

    Returns:
        ``y + '-I'`` when ``x`` is missing (insert), ``x + '-D'`` when ``y``
        is missing (delete), ``x + '-S'`` when both are equal (same), and
        ``y + '-U'`` otherwise (update).
    """
    if not x:
        return y + '-I'
    if not y:
        return x + '-D'
    if x == y:
        return x + '-S'
    return y + '-U'
from pyspark.sql.types import StringType

# Remember the original column names; they are needed to split the result.
_cols = df.columns
# Expose `flags` as a Spark UDF that returns a string.
flag_udf = F.udf(flags, StringType())
# Build one '-'-joined string per record on each side, outer-join the two
# sides on NO, and let the UDF tag every pair with its change flag.
df = (
    df.select(['NO'] + [F.concat_ws('-', *[F.col(_c) for _c in df.columns]).alias('f1')])
    .join(
        df1.select(['NO'] + [F.concat_ws('-', *[F.col(_c1) for _c1 in df1.columns]).alias('f2')]),
        'NO',
        'outer',
    )
    .select(flag_udf('f1', 'f2').alias('combined'))
)
df.show()
The result will be,
+----------------+
| combined|
+----------------+
| 5-HW-MAHI-700-S|
| 1-IT-RAM-1000-S|
|3-HR-GOPI-1500-D|
| 2-IT-SRI-900-U|
|4-MT-SUMP-1200-I|
+----------------+
Finally, split the combined column.
split_col = F.split(df['combined'], '-')
df = df.select([split_col.getItem(i).alias(s) for i,s in enumerate(_cols+['FLAG'])])
df.show()
You get the desired output,
+---+----+----+----+----+
| NO|DEPT|NAME| SAL|FLAG|
+---+----+----+----+----+
| 5| HW|MAHI| 700| S|
| 1| IT| RAM|1000| S|
| 3| HR|GOPI|1500| D|
| 2| IT| SRI| 900| U|
| 4| MT|SUMP|1200| I|
+---+----+----+----+----+

We Keep Coding

iphone swift flutter scala powershell matlab mongodb postgresql perl eclipse

What would be a Pyspark equivalent of the SQL statement NOT IN - pyspark

What would be the equivalent code in PySpark? If I have table A and Table B, and I want to select certain ID from Table A which is not in Table B, I can do the following SQL command: Select ID from Table A where ID not in (Select ID from Table B) What would be the equivalent code in PySpark?

Related

filter record in dataframe base on list of value

pyspark convert for loop to map

Merge many dataframes into one in Pyspark [non pandas df]

HI,Could you please help me resolving Issue while creating new column in Pyspark: I explained the issue as below:

Text file comparison using Spark data frames

Categories

Resources