pyspark collect_set of column outside of groupby - group-by

I am trying to use collect_set to get a list of category_name strings that are NOT part of the groupby.
My code is:
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
sc = SparkContext("local")
sqlContext = HiveContext(sc)
df = sqlContext.createDataFrame([
    ("1", "cat1", "Dept1", "product1", 7),
    ("2", "cat2", "Dept1", "product1", 100),
    ("3", "cat2", "Dept1", "product2", 3),
    ("4", "cat1", "Dept2", "product3", 5),
], ["id", "category_name", "department_id", "product_id", "value"])
df.show()
df.groupby("department_id", "product_id")\
    .agg({'value': 'sum'}) \
    .show()
# .agg(F.collect_set("category_name"))\
The output is
+---+-------------+-------------+----------+-----+
| id|category_name|department_id|product_id|value|
+---+-------------+-------------+----------+-----+
| 1| cat1| Dept1| product1| 7|
| 2| cat2| Dept1| product1| 100|
| 3| cat2| Dept1| product2| 3|
| 4| cat1| Dept2| product3| 5|
+---+-------------+-------------+----------+-----+
+-------------+----------+----------+
|department_id|product_id|sum(value)|
+-------------+----------+----------+
| Dept1| product2| 3|
| Dept1| product1| 107|
| Dept2| product3| 5|
+-------------+----------+----------+
I want to have this output
+-------------+----------+----------+----------------------------+
|department_id|product_id|sum(value)| collect_list(category_name)|
+-------------+----------+----------+----------------------------+
| Dept1| product2| 3| cat2 |
| Dept1| product1| 107| cat1, cat2 |
| Dept2| product3| 5| cat1 |
+-------------+----------+----------+----------------------------+
Attempt 1
df.groupby("department_id", "product_id")\
    .agg({'value': 'sum'}) \
    .agg(F.collect_set("category_name")) \
    .show()
I got this error:
pyspark.sql.utils.AnalysisException: "cannot resolve 'category_name'
given input columns: [department_id, product_id,
sum(value)];;\n'Aggregate [collect_set('category_name, 0, 0) AS
collect_set(category_name)#35]\n+- Aggregate [department_id#2,
product_id#3], [department_id#2, product_id#3, sum(value#4L) AS
sum(value)#24L]\n +- LogicalRDD [id#0, category_name#1,
department_id#2, product_id#3, value#4L]\n"
Attempt 2: I put category_name as part of the groupby
df.groupby("category_name", "department_id", "product_id")\
.agg({'value': 'sum'}) \
.agg(F.collect_set("category_name")) \
.show()
It works, but the output is not correct:
+--------------------------+
|collect_set(category_name)|
+--------------------------+
| [cat1, cat2]|
+--------------------------+

You can specify multiple aggregations within one agg(). The correct syntax for your case would be:
df.groupby("department_id", "product_id")\
    .agg(F.sum('value'), F.collect_set("category_name"))\
    .show()
#+-------------+----------+----------+--------------------------+
#|department_id|product_id|sum(value)|collect_set(category_name)|
#+-------------+----------+----------+--------------------------+
#| Dept1| product2| 3| [cat2]|
#| Dept1| product1| 107| [cat1, cat2]|
#| Dept2| product3| 5| [cat1]|
#+-------------+----------+----------+--------------------------+
Your method doesn't work because the first .agg() works on a pyspark.sql.group.GroupedData and returns a new DataFrame. The subsequent call to agg is actually pyspark.sql.DataFrame.agg, which is
shorthand for df.groupBy().agg().
So essentially the second call to agg aggregates the already-grouped result again, which is not what you intended.
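If you also want friendlier column names in the result, the aggregates can be aliased inside that same agg() call (a small sketch; the alias names here are just examples):
df.groupby("department_id", "product_id")\
    .agg(F.sum("value").alias("total_value"),
         F.collect_set("category_name").alias("category_names"))\
    .show()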

Related

Pyspark sort and get first and last

I used the code below to sort based on one column. I am wondering how I can get the first element and last element in the sorted dataframe?
group_by_dataframe
    .count()
    .filter("`count` >= 10")
    .sort(desc("count"))
The max and min functions need a group to work with. To circumvent the issue, you can create a dummy column as below, then call max and min to get the maximum and minimum values.
If that's all you need, you don't really need sort here.
from pyspark.sql import functions as F
df = spark.createDataFrame([("a", 0.694), ("b", -2.669), ("a", 0.245), ("a", 0.1), ("b", 0.3), ("c", 0.3)], ["n", "val"])
df.show()
+---+------+
| n| val|
+---+------+
| a| 0.694|
| b|-2.669|
| a| 0.245|
| a| 0.1|
| b| 0.3|
| c| 0.3|
+---+------+
df = df.groupby('n').count() #.sort(F.desc('count'))
df = df.withColumn('dummy', F.lit(1))
df.show()
+---+-----+-----+
| n|count|dummy|
+---+-----+-----+
| c| 1| 1|
| b| 2| 1|
| a| 3| 1|
+---+-----+-----+
df = df.groupBy('dummy').agg(F.min('count').alias('min'), F.max('count').alias('max')).drop('dummy')
df.show()
+---+---+
|min|max|
+---+---+
| 1| 3|
+---+---+
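As an aside, DataFrame.agg can also be called directly on the counted dataframe, which skips the dummy column entirely (a minimal sketch, assuming df is the original (n, val) dataframe created above):
counts = df.groupby('n').count()
counts.agg(F.min('count').alias('min'), F.max('count').alias('max')).show()
# +---+---+
# |min|max|
# +---+---+
# |  1|  3|
# +---+---+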

How to subtract DataFrames using subset of columns in Apache Spark

How can I perform a filter operation on Dataframe1 using Dataframe2?
I want to remove rows from DataFrame1 for the matching condition below:
Dataframe1.col1 = Dataframe2.col1
Dataframe1.col2 = Dataframe2.col2
My question is different from subtracting two dataframes, because subtract uses all columns, whereas here I want to use only a limited number of columns.
join with "left_anti"
scala> df1.show
+----+-----+-----+
|col1| col2| col3|
+----+-----+-----+
| 1| one| ek|
| 2| two| dho|
| 3|three|theen|
| 4| four|chaar|
+----+-----+-----+
scala> df2.show
+----+----+-----+
|col1|col2| col3|
+----+----+-----+
| 2| two| dho|
| 4|four|chaar|
+----+----+-----+
scala> df1.join(df2, Seq("col1", "col2"), "left_anti").show
+----+-----+-----+
|col1| col2| col3|
+----+-----+-----+
| 1| one| ek|
| 3|three|theen|
+----+-----+-----+
Possible duplicate of: Spark: subtract two DataFrames if both datasets have exact same columns
If you want a custom join condition then you can use an "anti" join. Here is the pyspark version.
Creating two data frames:
Dataframe1 :
l1 = [('col1_row1', 10), ('col1_row2', 20), ('col1_row3', 30)]
df1 = spark.createDataFrame(l1).toDF('col1','col2')
df1.show()
+---------+----+
| col1|col2|
+---------+----+
|col1_row1| 10|
|col1_row2| 20|
|col1_row3| 30|
+---------+----+
Dataframe2 :
l2 = [('col1_row1', 10), ('col1_row2', 20), ('col1_row4', 40)]
df2 = spark.createDataFrame(l2).toDF('col1','col2')
df2.show()
+---------+----+
| col1|col2|
+---------+----+
|col1_row1| 10|
|col1_row2| 20|
|col1_row4| 40|
+---------+----+
Using the subtract API (note that subtract compares entire rows, so it only helps here because both dataframes have exactly the same columns):
df_final = df1.subtract(df2)
df_final.show()
+---------+----+
| col1|col2|
+---------+----+
|col1_row3| 30|
+---------+----+
Using left_anti :
Join condition:
join_condition = [df1["col1"] == df2["col1"], df1["col2"] == df2["col2"]]
Finally, the join:
df_final = df1.join(df2, join_condition, 'left_anti')
df_final.show()
+---------+----+
| col1|col2|
+---------+----+
|col1_row3| 30|
+---------+----+
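For reference, the pyspark join also accepts a plain list of column names, mirroring the Scala Seq form shown earlier (a sketch, equivalent to the left_anti join above):
df_final = df1.join(df2, ["col1", "col2"], "left_anti")
df_final.show()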

How to add columns in pyspark dataframe dynamically

I am trying to add a few columns based on the input variable vIssueCols.
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
vIssueCols=['jobid','locid']
vQuery1 = 'vSrcData2= vSrcData'
vWindow1 = Window.partitionBy("vKey").orderBy("vOrderBy")
for x in vIssueCols:
    vQuery1 = vQuery1 + '.withColumn("' + x + '_prev",F.lag(vSrcData.' + x + ').over(vWindow1))'
exec(vQuery1)
The above will generate vQuery1 as below, and it works, but:
vSrcData2= vSrcData.withColumn("jobid_prev",F.lag(vSrcData.jobid).over(vWindow1)).withColumn("locid_prev",F.lag(vSrcData.locid).over(vWindow1))
Can't I write a query something like
vSrcData2= vSrcData.withColumn(x+"_prev",F.lag(vSrcData.x).over(vWindow1))for x in vIssueCols
and generate the columns with a loop statement? Some blogs suggest adding a udf and calling that, but instead of a udf I am using the string/exec method above.
You can build your query using reduce.
from pyspark.sql.functions import lag
from pyspark.sql.window import Window
from functools import reduce
#sample data
df = sc.parallelize([[1, 200, '1234', 'asdf'],
                     [1, 50, '2345', 'qwerty'],
                     [1, 100, '4567', 'xyz'],
                     [2, 300, '123', 'prem'],
                     [2, 10, '000', 'ankur']]).\
    toDF(["vKey","vOrderBy","jobid","locid"])
df.show()
vWindow1 = Window.partitionBy("vKey").orderBy("vOrderBy")
#your existing processing
df1 = df.\
    withColumn("jobid_prev", lag(df.jobid).over(vWindow1)).\
    withColumn("locid_prev", lag(df.locid).over(vWindow1))
df1.show()
#to-be processing
vIssueCols=['jobid','locid']
df2 = (reduce(
    lambda r_df, col_name: r_df.withColumn(col_name + "_prev", lag(r_df[col_name]).over(vWindow1)),
    vIssueCols,
    df
))
df2.show()
Sample data:
+----+--------+-----+------+
|vKey|vOrderBy|jobid| locid|
+----+--------+-----+------+
| 1| 200| 1234| asdf|
| 1| 50| 2345|qwerty|
| 1| 100| 4567| xyz|
| 2| 300| 123| prem|
| 2| 10| 000| ankur|
+----+--------+-----+------+
Output:
+----+--------+-----+------+----------+----------+
|vKey|vOrderBy|jobid| locid|jobid_prev|locid_prev|
+----+--------+-----+------+----------+----------+
| 1| 50| 2345|qwerty| null| null|
| 1| 100| 4567| xyz| 2345| qwerty|
| 1| 200| 1234| asdf| 4567| xyz|
| 2| 10| 000| ankur| null| null|
| 2| 300| 123| prem| 000| ankur|
+----+--------+-----+------+----------+----------+
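If you prefer the loop form from the question, an ordinary for loop over vIssueCols builds the same dataframe without exec (a sketch equivalent to the reduce version above):
df3 = df
for col_name in vIssueCols:
    df3 = df3.withColumn(col_name + "_prev", lag(df3[col_name]).over(vWindow1))
df3.show()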
Hope this helps!

How to use Sum on groupBy result in Spark DataFrames?

Based on the following dataframe:
+---+-----+----+
| ID|Categ|Amnt|
+---+-----+----+
| 1| A| 10|
| 1| A| 5|
| 2| A| 56|
| 2| B| 13|
+---+-----+----+
I would like to obtain the sum of the column Amnt grouped by ID and Categ.
+---+-----+-----+
| ID|Categ|Count|
+---+-----+-----+
| 1| A| 15 |
| 2| A| 56 |
| 2| B| 13 |
+---+-----+-----+
In SQL I would be doing something like
SELECT ID,
Categ,
SUM (Count)
FROM Table
GROUP BY ID,
Categ;
But how do I do this in Scala?
I tried
DF.groupBy($"ID", $"Categ").sum("Count")
But this just changed the Count column name into sum(count) instead of actually giving me the sum of the counts.
Maybe you were summing the wrong column, but your groupBy/sum statement looks syntactically correct to me:
val df = Seq(
(1, "A", 10),
(1, "A", 5),
(2, "A", 56),
(2, "B", 13)
).toDF("ID", "Categ", "Amnt")
df.groupBy("ID", "Categ").sum("Amnt").show
// +---+-----+---------+
// | ID|Categ|sum(Amnt)|
// +---+-----+---------+
// | 1| A| 15|
// | 2| A| 56|
// | 2| B| 13|
// +---+-----+---------+
EDIT:
To alias the sum(Amnt) column (or, for multiple aggregations), wrap the aggregation expression(s) with agg. For example:
import org.apache.spark.sql.functions.{sum, count}

// Rename `sum(Amnt)` as `Sum`
df.groupBy("ID", "Categ").agg(sum("Amnt").as("Sum"))
// Aggregate `sum(Amnt)` and `count(Categ)`
df.groupBy("ID", "Categ").agg(sum("Amnt"), count("Categ"))

How to create new DataFrame with dict

I have a dict, like:
cMap = {"k1" : "v1", "k2" : "v1", "k3" : "v2", "k4" : "v2"}
and one DataFrame A, like:
+---+
|key|
+---+
| k1|
| k2|
| k3|
| k4|
+---+
I create the DataFrame above with this code:
data = [('k1',),
        ('k2',),
        ('k3',),
        ('k4',)]
A = spark.createDataFrame(data, ['key'])
I want to get the new DataFrame, like:
+---+----------+----------+
|key| v1 | v2 |
+---+----------+----------+
| k1|true |false |
| k2|true |false |
| k3|false |true |
| k4|false |true |
+---+----------+----------+
I would appreciate any suggestions, thanks!
I just wanted to contribute a different and possibly easier way to solve this.
In my code I convert the dict to a pandas dataframe, which I find much easier, and then convert the pandas dataframe directly to Spark.
import pandas as pd

data = {'visitor': ['foo', 'bar', 'jelmer'],
        'A': [0, 1, 0],
        'B': [1, 0, 1],
        'C': [1, 0, 0]}
df = pd.DataFrame(data)
ddf = spark.createDataFrame(df)
Output:
+---+---+---+-------+
| A| B| C|visitor|
+---+---+---+-------+
| 0| 1| 1| foo|
| 1| 0| 0| bar|
| 0| 1| 0| jelmer|
+---+---+---+-------+
I just wanted to add an easy way to create a DF, using pyspark:
values = [("K1","true","false"),("K2","true","false")]
columns = ['Key', 'V1', 'V2']
df = spark.createDataFrame(values, columns)
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

sc = SparkContext()
spark = SQLContext(sc)

val_dict = {
    'key1': 'val1',  # example values; substitute your own
    'key2': 'val2',
    'key3': 'val3'
}
rdd = sc.parallelize([val_dict])
bu_zdf = spark.read.json(rdd)
The dictionary can be converted to a dataframe and joined with the other one. My piece of code:
from pyspark.sql import functions as F

data = sc.parallelize([(k,) + (v,) for k, v in cMap.items()]).toDF(['key', 'val'])
keys = sc.parallelize([('k1',), ('k2',), ('k3',), ('k4',)]).toDF(["key"])
newDF = data.join(keys, 'key').select(
    "key",
    F.when(F.col("val") == "v1", "True").otherwise("False").alias("v1"),
    F.when(F.col("val") == "v2", "True").otherwise("False").alias("v2"))
>>> newDF.show()
+---+-----+-----+
|key| v1| v2|
+---+-----+-----+
| k1| True|False|
| k2| True|False|
| k3|False| True|
| k4|False| True|
+---+-----+-----+
If there are more values, you can code that when clause as a UDF and use it.
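A rough sketch of that udf idea (the equals_label helper and the labels list here are just illustrative, not part of the original code):
from pyspark.sql import functions as F
from pyspark.sql.types import BooleanType

# hypothetical helper: True when the row's val matches the given label
equals_label = F.udf(lambda val, label: val == label, BooleanType())

labels = ["v1", "v2"]  # in practice, the distinct values of cMap
newDF = data.join(keys, 'key').select(
    "key",
    *[equals_label(F.col("val"), F.lit(label)).alias(label) for label in labels])
newDF.show()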
Thanks everyone for the suggestions. I figured out another way to resolve my problem, using pivot. The code is:
cMap = {"k1" : "v1", "k2" : "v1", "k3" : "v2", "k4" : "v2"}
a_cMap = [(k,)+(v,) for k,v in cMap.items()]
data = spark.createDataFrame(a_cMap, ['key','val'])
from pyspark.sql.functions import count
data = data.groupBy('key').pivot('val').agg(count('val'))
data.show()
+---+----+----+
|key| v1| v2|
+---+----+----+
| k2| 1|null|
| k4|null| 1|
| k1| 1|null|
| k3|null| 1|
+---+----+----+
data = data.na.fill(0)
data.show()
+---+---+---+
|key| v1| v2|
+---+---+---+
| k2| 1| 0|
| k4| 0| 1|
| k1| 1| 0|
| k3| 0| 1|
+---+---+---+
keys = spark.createDataFrame([('k1','2'),('k2','3'),('k3','4'),('k4','5'),('k5','6')], ["key",'temp'])
newDF = keys.join(data,'key')
newDF.show()
+---+----+---+---+
|key|temp| v1| v2|
+---+----+---+---+
| k2| 3| 1| 0|
| k4| 5| 0| 1|
| k1| 2| 1| 0|
| k3| 4| 0| 1|
+---+----+---+---+
But I can't convert 1 to true and 0 to false.
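One way to finish that conversion (a minimal sketch, assuming the pivoted and na-filled data above) is to compare the counts to 1, which yields boolean columns:
from pyspark.sql.functions import col

data = data.select("key", *[(col(c) == 1).alias(c) for c in ["v1", "v2"]])
data.show()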
I parallelize cMap.items() and check whether the value equals v1 or v2. Then I join back to dataframe A on the key column.
from pyspark.sql import Row

# example dataframe A
df_A = spark.sparkContext.parallelize(['k1', 'k2', 'k3', 'k4']).map(lambda x: Row(**{'key': x})).toDF()

cmap_rdd = spark.sparkContext.parallelize(cMap.items())
cmap_df = cmap_rdd.map(lambda x: Row(**dict([('key', x[0]), ('v1', x[1] == 'v1'), ('v2', x[1] == 'v2')]))).toDF()

df_A.join(cmap_df, on='key').orderBy('key').show()
Resulting dataframe:
+---+-----+-----+
|key| v1| v2|
+---+-----+-----+
| k1| true|false|
| k2| true|false|
| k3|false| true|
| k4|false| true|
+---+-----+-----+