PySpark - How do I flatten a nested struct column preserving the parent name?

I have a dataframe with a struct column named structA, which contains 3 fields named a, b and c.
If I want to flatten the struct I can easily do that with df.select("structA.*"), and it will display:
+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  3|  5|  6|
+---+---+---+
What I want is:
+---------+---------+---------+
|structA.a|structA.b|structA.c|
+---------+---------+---------+
|        1|        2|        3|
|        3|        5|        6|
+---------+---------+---------+
How can I do this?

I'm afraid it's not as straightforward as it should be. You'll need to loop through the schema to build your desired column names, then rename the columns in bulk. Something like this:
Sample dataset
df = spark.createDataFrame([
    ((1, 2, 3),),
    ((4, 5, 6),),
], 'structA struct<a:int, b:int, c:int>')
df.show()
df.printSchema()
+---------+
| structA|
+---------+
|{1, 2, 3}|
|{4, 5, 6}|
+---------+
root
|-- structA: struct (nullable = true)
| |-- a: integer (nullable = true)
| |-- b: integer (nullable = true)
| |-- c: integer (nullable = true)
from pyspark.sql import functions as F
struct_col = 'structA'
struct_cols = [
    [F.col(b.name).alias(f'{a.name}_{b.name}') for b in a.dataType.fields]
    for a in df.schema
    if a.name == struct_col
][0]
# [Column<'a AS structA_a'>, Column<'b AS structA_b'>, Column<'c AS structA_c'>]
df.select(f'{struct_col}.*').select(struct_cols).show()
+---------+---------+---------+
|structA_a|structA_b|structA_c|
+---------+---------+---------+
| 1| 2| 3|
| 4| 5| 6|
+---------+---------+---------+
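If you have several struct columns, or want to keep non-struct columns alongside, a slightly more general sketch (assuming the df defined above) builds the whole flattened projection in one pass:
from pyspark.sql import functions as F
from pyspark.sql import types as T

flat_cols = []
for field in df.schema.fields:
    if isinstance(field.dataType, T.StructType):
        # prefix each child column with the parent struct's name
        flat_cols += [
            F.col(f'{field.name}.{sub.name}').alias(f'{field.name}_{sub.name}')
            for sub in field.dataType.fields
        ]
    else:
        flat_cols.append(F.col(field.name))

df.select(flat_cols).show()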

I did the following in order to do this:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# create dataframe ----
df_data = [((1, 2, 3), (-1, -2, -3)), ((4, 5, 6), (-4, -5, -6))]
structureSchema = StructType([
    StructField('structA', StructType([
        StructField('a', IntegerType(), True),
        StructField('b', IntegerType(), True),
        StructField('c', IntegerType(), True)
    ])),
    StructField('structB', StructType([
        StructField('a', IntegerType(), True),
        StructField('b', IntegerType(), True),
        StructField('c', IntegerType(), True)
    ]))
])
df = spark.createDataFrame(df_data, structureSchema)
df.show()
+---------+------------+
| structA| structB|
+---------+------------+
|[1, 2, 3]|[-1, -2, -3]|
|[4, 5, 6]|[-4, -5, -6]|
+---------+------------+
If we have multiple struct-like columns, we need to find them like this:
nested_cols = [c[0] for c in df.dtypes if c[1][:6] == 'struct']
nested_cols
['structA', 'structB']
Now I will create a dict mapping each struct column to its field names:
struct_columns = {}
for struct_column in nested_cols:
    struct_columns[struct_column] = df.select(struct_column + ".*").columns
struct_columns
{'structA': ['a', 'b', 'c'], 'structB': ['a', 'b', 'c']}
With that structure, I will create the flattened data frame:
flatten_df = df
for key in struct_columns:
    print(key)
    for column in struct_columns[key]:
        flatten_df = flatten_df.withColumn(key + "_" + column, F.expr(f"{key}.{column}"))

flatten_df.drop(*df.columns).show()
+---------+---------+---------+---------+---------+---------+
|structA_a|structA_b|structA_c|structB_a|structB_b|structB_c|
+---------+---------+---------+---------+---------+---------+
| 1| 2| 3| -1| -2| -3|
| 4| 5| 6| -4| -5| -6|
+---------+---------+---------+---------+---------+---------+
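A minimal alternative sketch, reusing the struct_columns dict and df built above: collecting the flattened columns into a single select keeps the query plan flat instead of adding one projection per withColumn call.
flat_cols = [
    F.col(f"{key}.{column}").alias(f"{key}_{column}")
    for key in struct_columns
    for column in struct_columns[key]
]
df.select(flat_cols).show()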

Related

Pyspark RDD column value selection

I have an rdd like this:
+-------+----------------------------------------------------------+
|item_id|recommendations                                           |
+-------+----------------------------------------------------------+
|1      |[{810, 5.2324243}, {134, 4.58323}, {810, 4.89248}]        |
|23     |[{1643, 5.1180077}, {1463, 4.8429747}, {1368, 4.4758873}] |
+-------+----------------------------------------------------------+
I want to extract only the first value in each {} from the "recommendations" column.
Expected result looks like this:
+-------+--------------------+
|item_id|     recommendations|
+-------+--------------------+
|      1|   [{810, 134, 810}]|
|     23|[{1643, 1463, 1368}]|
+-------+--------------------+
What should I do? Thanks!
Not sure if your data is an rdd or a dataframe, so I provide both here. Overall, from your sample data, I assume your recommendations is an array of struct type. You will know the exact columns by running df.printSchema() (if it was a dataframe) or rdd.first() (if it was an rdd). I created a dummy schema with two columns a and b.
This is my "dummy" class
class X():
    def __init__(self, a, b):
        self.a = a
        self.b = b
This is my "dummy" data
from pyspark.sql import functions as F
from pyspark.sql import types as T

schema = T.StructType([
    T.StructField('id', T.IntegerType()),
    T.StructField('rec', T.ArrayType(T.StructType([
        T.StructField('a', T.IntegerType()),
        T.StructField('b', T.FloatType()),
    ])))
])
df = spark.createDataFrame([
    (1, [X(810, 5.2324243), X(134, 4.58323), X(810, 4.89248)]),
    (23, [X(1643, 5.1180077), X(1463, 4.8429747), X(1368, 4.4758873)])
], schema)
If your data is a dataframe
df.show(10, False)
df.printSchema()
+---+---------------------------------------------------------+
|id |rec |
+---+---------------------------------------------------------+
|1 |[{810, 5.2324243}, {134, 4.58323}, {810, 4.89248}] |
|23 |[{1643, 5.1180077}, {1463, 4.8429747}, {1368, 4.4758873}]|
+---+---------------------------------------------------------+
root
|-- id: integer (nullable = true)
|-- rec: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- a: integer (nullable = true)
| | |-- b: float (nullable = true)
(df
    .select('id', F.explode('rec').alias('rec'))
    .groupBy('id')
    .agg(F.collect_list('rec.a').alias('rec'))
    .show()
)
+---+------------------+
| id| rec|
+---+------------------+
| 1| [810, 134, 810]|
| 23|[1643, 1463, 1368]|
+---+------------------+
If your data is an rdd
dfrdd = df.rdd
dfrdd.first()
# Row(id=1, rec=[Row(a=810, b=5.232424259185791), Row(a=134, b=4.583230018615723), Row(a=810, b=4.89247989654541)])
(dfrdd
    .map(lambda x: (x.id, [r.a for r in x.rec]))
    .toDF()
    .show()
)
+---+------------------+
| _1| _2|
+---+------------------+
| 1| [810, 134, 810]|
| 23|[1643, 1463, 1368]|
+---+------------------+
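On Spark 2.4 or later, a shorter sketch (assuming the same df as above) uses the transform higher-order function to map over the array in place, so no explode/groupBy round trip is needed:
from pyspark.sql import functions as F

df.withColumn('rec', F.expr('transform(rec, x -> x.a)')).show()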

Manually create a pyspark dataframe

I am trying to manually create a pyspark dataframe given certain data:
row_in = [(1566429545575348), (40.353977), (-111.701859)]
rdd = sc.parallelize(row_in)
schema = StructType(
    [
        StructField("time_epocs", DecimalType(), True),
        StructField("lat", DecimalType(), True),
        StructField("long", DecimalType(), True),
    ]
)
df_in_test = spark.createDataFrame(rdd, schema)
This gives an error when I try to display the dataframe, so I am not sure how to do this.
However, the Spark documentation seems to be a bit convoluted to me, and I got similar errors when I tried to follow those instructions.
Does anyone know how to do this?
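As an aside, the snippet above most likely fails for two reasons: row_in is a flat list of three scalars rather than a list containing one 3-element tuple (parentheses around a single value do not make a tuple), and DecimalType() does not accept plain Python floats. A hedged sketch of a working variant, swapping DecimalType for LongType/DoubleType:
from pyspark.sql.types import StructType, StructField, LongType, DoubleType

# One row with three columns; note the single tuple.
row_in = [(1566429545575348, 40.353977, -111.701859)]
schema = StructType(
    [
        StructField("time_epocs", LongType(), True),
        StructField("lat", DoubleType(), True),
        StructField("long", DoubleType(), True),
    ]
)
df_in_test = spark.createDataFrame(row_in, schema)
df_in_test.show()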
Simple dataframe creation:
df = spark.createDataFrame(
    [
        (1, "foo"),  # create your data here, be consistent in the types.
        (2, "bar"),
    ],
    ["id", "label"],  # add your column names here
)
df.printSchema()
root
|-- id: long (nullable = true)
|-- label: string (nullable = true)
df.show()
+---+-----+
| id|label|
+---+-----+
| 1| foo|
| 2| bar|
+---+-----+
According to the official docs:
When schema is a list of column names, the type of each column will be inferred from data. (example above ↑)
When schema is pyspark.sql.types.DataType or a datatype string, it must match the real data. (examples below ↓)
# Example with a datatype string
df = spark.createDataFrame(
    [
        (1, "foo"),  # Add your data here
        (2, "bar"),
    ],
    "id int, label string",  # add column names and types here
)

# Example with pyspark.sql.types
from pyspark.sql import types as T

df = spark.createDataFrame(
    [
        (1, "foo"),  # Add your data here
        (2, "bar"),
    ],
    T.StructType(  # Define the whole schema within a StructType
        [
            T.StructField("id", T.IntegerType(), True),
            T.StructField("label", T.StringType(), True),
        ]
    ),
)
df.printSchema()
root
|-- id: integer (nullable = true) # type is forced to Int
|-- label: string (nullable = true)
Additionally, you can create your dataframe from a Pandas dataframe; the schema will be inferred from the Pandas dataframe's types:
import pandas as pd
import numpy as np
pdf = pd.DataFrame(
    {
        "col1": [np.random.randint(10) for x in range(10)],
        "col2": [np.random.randint(100) for x in range(10)],
    }
)
df = spark.createDataFrame(pdf)
df.show()
+----+----+
|col1|col2|
+----+----+
| 6| 4|
| 1| 39|
| 7| 4|
| 7| 95|
| 6| 3|
| 7| 28|
| 2| 26|
| 0| 4|
| 4| 32|
+----+----+
To elaborate/build off of @Steven's answer:
from pyspark.sql.types import StructType, StructField, FloatType, StringType

field = [
    StructField("MULTIPLIER", FloatType(), True),
    StructField("DESCRIPTION", StringType(), True),
]
schema = StructType(field)
multiplier_df = sqlContext.createDataFrame(sc.emptyRDD(), schema)
This will create a blank dataframe.
We can now simply add a row to it:
l = [(2.3, "this is a sample description")]
rdd = sc.parallelize(l)
multiplier_df_temp = spark.createDataFrame(rdd, schema)
multiplier_df = multiplier_df.union(multiplier_df_temp)
This answer demonstrates how to create a PySpark DataFrame with createDataFrame, create_df and toDF.
df = spark.createDataFrame([("joe", 34), ("luisa", 22)], ["first_name", "age"])
df.show()
+----------+---+
|first_name|age|
+----------+---+
| joe| 34|
| luisa| 22|
+----------+---+
You can also pass createDataFrame an RDD and a schema to construct DataFrames with more precision:
from pyspark.sql import Row
from pyspark.sql.types import *

rdd = spark.sparkContext.parallelize([
    Row(name='Allie', age=2),
    Row(name='Sara', age=33),
    Row(name='Grace', age=31)])
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), False)])
df = spark.createDataFrame(rdd, schema)
df.show()
+-----+---+
| name|age|
+-----+---+
|Allie| 2|
| Sara| 33|
|Grace| 31|
+-----+---+
create_df from my Quinn project allows for the best of both worlds - it's concise and fully descriptive:
from pyspark.sql.types import *
from quinn.extensions import *
df = spark.create_df(
    [("jose", "a"), ("li", "b"), ("sam", "c")],
    [("name", StringType(), True), ("blah", StringType(), True)]
)
df.show()
+----+----+
|name|blah|
+----+----+
|jose| a|
| li| b|
| sam| c|
+----+----+
toDF doesn't offer any advantages over the other approaches:
from pyspark.sql import Row
rdd = spark.sparkContext.parallelize([
    Row(name='Allie', age=2),
    Row(name='Sara', age=33),
    Row(name='Grace', age=31)])
df = rdd.toDF()
df.show()
+-----+---+
| name|age|
+-----+---+
|Allie| 2|
| Sara| 33|
|Grace| 31|
+-----+---+
With formatting
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [
        (1, "foo"),
        (2, "bar"),
    ],
    StructType(
        [
            StructField("id", IntegerType(), False),
            StructField("txt", StringType(), False),
        ]
    ),
)
print(df.dtypes)
df.show()
Extending @Steven's answer:
data = [(i, 'foo') for i in range(1000)] # random data
columns = ['id', 'txt'] # add your columns label here
df = spark.createDataFrame(data, columns)
Note: When schema is a list of column names, the type of each column will be inferred from the data.
If you want to define the schema explicitly, do this:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
schema = StructType([StructField("id", IntegerType(), True), StructField("txt", StringType(), True)])
df1 = spark.createDataFrame(data, schema)
Outputs:
>>> df1
DataFrame[id: int, txt: string]
>>> df
DataFrame[id: bigint, txt: string]
For beginners, a full example importing data from a file:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    ShortType,
    StringType,
    StructType,
    StructField,
    TimestampType,
)
import os

here = os.path.abspath(os.path.dirname(__file__))
spark = SparkSession.builder.getOrCreate()
schema = StructType(
    [
        StructField("id", ShortType(), nullable=False),
        StructField("string", StringType(), nullable=False),
        StructField("datetime", TimestampType(), nullable=False),
    ]
)
# read file or construct rows manually
df = spark.read.csv(os.path.join(here, "data.csv"), schema=schema, header=True)
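The comment above also mentions constructing rows manually; a small sketch of that branch against the same schema (the sample rows below are made up for illustration):
from datetime import datetime

df = spark.createDataFrame(
    [
        (1, "foo", datetime(2021, 1, 1, 12, 0)),
        (2, "bar", datetime(2021, 1, 2, 12, 0)),
    ],
    schema,
)
df.show()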

Spark select item in array by max score

Given the following DataFrame containing an id and a Seq of Stuff (with an id and score), how do I select the "best" Stuff in the array by score?
I'd like NOT to use UDFs and possibly work with Spark DataFrame functions only.
case class Stuff(id: Int, score: Double)
val df = spark.createDataFrame(Seq(
(1, Seq(Stuff(11, 0.4), Stuff(12, 0.5))),
(2, Seq(Stuff(22, 0.9), Stuff(23, 0.8)))
)).toDF("id", "data")
df.show(false)
+---+----------------------+
|id |data |
+---+----------------------+
|1 |[[11, 0.4], [12, 0.5]]|
|2 |[[22, 0.9], [23, 0.8]]|
+---+----------------------+
df.printSchema
root
|-- id: integer (nullable = false)
|-- data: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: integer (nullable = false)
| | |-- score: double (nullable = false)
I tried going down the route of window functions but the code gets a bit too convoluted. Expected output:
+---+---------+
|id |topStuff |
+---+---------+
|1  |[12, 0.5]|
|2  |[22, 0.9]|
+---+---------+
You can use Spark 2.4+ higher-order functions:
df
.selectExpr("id","(filter(data, x -> x.score == array_max(data.score)))[0] as topstuff")
.show()
gives
+---+---------+
| id| topstuff|
+---+---------+
| 1|[12, 0.5]|
| 2|[22, 0.9]|
+---+---------+
As an alternative, use window-functions (requires shuffling!):
df
.select($"id",explode($"data").as("topstuff"))
.withColumn("selector",max($"topstuff.score") .over(Window.partitionBy($"id")))
.where($"topstuff.score"===$"selector")
.drop($"selector")
.show()
also gives:
+---+---------+
| id| topstuff|
+---+---------+
| 1|[12, 0.5]|
| 2|[22, 0.9]|
+---+---------+
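Since most of this thread is PySpark, here is a hedged Python equivalent of the same higher-order-function approach (assuming a dataframe with the same id/data schema):
from pyspark.sql import functions as F

df.select(
    "id",
    F.expr("filter(data, x -> x.score == array_max(data.score))[0]").alias("topstuff"),
).show()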

Spark scala dataframe: Merging multiple columns into single column

I have a spark dataframe which looks something like below:
+---+------+----+
| id|animal|talk|
+---+------+----+
| 1| bat|done|
| 2| mouse|mone|
| 3| horse| gun|
| 4| horse|some|
+---+------+----+
I want to generate a new column, say merged, which would look something like this:
+---+-----------------------------------------------------------+
| id| merged columns |
+---+-----------------------------------------------------------+
| 1| [{name: animal, value: bat}, {name: talk, value: done}] |
| 2| [{name: animal, value: mouse}, {name: talk, value: mone}] |
| 3| [{name: animal, value: horse}, {name: talk, value: gun}] |
| 4| [{name: animal, value: horse}, {name: talk, value: some}] |
+---+-----------------------------------------------------------+
Basically, I want to combine all the columns into an array of a case class merged(name: String, value: String).
Can anyone help me with how to do this in Scala?
Here, for simplicity, I have used only two columns, but a generic answer that works for N columns would greatly help.
If I understand your requirement of producing a list of name-value structured objects correctly, consider using foldLeft to iteratively convert the wanted columns to StructType name-value columns, then group them into an ArrayType column:
import org.apache.spark.sql.functions._
val df = Seq(
  (1, "bat", "done"),
  (2, "mouse", "mone"),
  (3, "horse", "gun"),
  (4, "horse", "some")
).toDF("id", "animal", "talk")

val cols = df.columns.filter(_ != "id")

val resultDF = cols.
  foldLeft(df)( (accDF, c) =>
    accDF.withColumn(c, struct(lit(c).as("name"), col(c).as("value")))
  ).
  select($"id", array(cols.map(col): _*).as("merged"))
resultDF.show(false)
// +---+-----------------------------+
// |id |merged |
// +---+-----------------------------+
// |1 |[[animal,bat], [talk,done]] |
// |2 |[[animal,mouse], [talk,mone]]|
// |3 |[[animal,horse], [talk,gun]] |
// |4 |[[animal,horse], [talk,some]]|
// +---+-----------------------------+
resultDF.printSchema
// root
// |-- id: integer (nullable = false)
// |-- merged: array (nullable = false)
// | |-- element: struct (containsNull = false)
// | | |-- name: string (nullable = false)
// | | |-- value: string (nullable = true)
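For PySpark readers, a hedged sketch of the same idea (reusing the column names from the question); a list comprehension plays the role of foldLeft here:
from pyspark.sql import functions as F

pydf = spark.createDataFrame(
    [(1, "bat", "done"), (2, "mouse", "mone"), (3, "horse", "gun"), (4, "horse", "some")],
    ["id", "animal", "talk"],
)
cols = [c for c in pydf.columns if c != "id"]
merged = F.array(*[
    F.struct(F.lit(c).alias("name"), F.col(c).alias("value")) for c in cols
])
pydf.select("id", merged.alias("merged")).show(truncate=False)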

PySpark Dataframe from Python Dictionary without Pandas

I am trying to convert the following Python dict into a PySpark DataFrame, but I am not getting the expected output.
dict_lst = {'letters': ['a', 'b', 'c'],
            'numbers': [10, 20, 30]}
df_dict = sc.parallelize([dict_lst]).toDF()  # Result not as expected
df_dict.show()
Is there a way to do this without using Pandas?
Quoting myself:
I find it's useful to think of the argument to createDataFrame() as a list of tuples where each entry in the list corresponds to a row in the DataFrame and each element of the tuple corresponds to a column.
So the easiest thing is to convert your dictionary into this format. You can easily do this using zip():
column_names, data = zip(*dict_lst.items())
spark.createDataFrame(zip(*data), column_names).show()
#+-------+-------+
#|letters|numbers|
#+-------+-------+
#| a| 10|
#| b| 20|
#| c| 30|
#+-------+-------+
The above assumes that all of the lists are the same length. If this is not the case, you would have to use itertools.izip_longest (python2) or itertools.zip_longest (python3).
from itertools import izip_longest as zip_longest # use this for python2
#from itertools import zip_longest # use this for python3
dict_lst = {'letters': ['a', 'b', 'c'],
'numbers': [10, 20, 30, 40]}
column_names, data = zip(*dict_lst.items())
spark.createDataFrame(zip_longest(*data), column_names).show()
#+-------+-------+
#|letters|numbers|
#+-------+-------+
#| a| 10|
#| b| 20|
#| c| 30|
#| null| 40|
#+-------+-------+
Your dict_lst is not really the format you want for creating a dataframe. It would be better if you had a list of dicts instead of a dict of lists.
This code creates a DataFrame from your dict of lists:
from pyspark.sql import SQLContext, Row

sqlContext = SQLContext(sc)
dict_lst = {'letters': ['a', 'b', 'c'],
            'numbers': [10, 20, 30]}

values_lst = dict_lst.values()
nb_rows = [len(lst) for lst in values_lst]
assert min(nb_rows) == max(nb_rows)  # We must have the same number of elements for each key

row_lst = []
columns = dict_lst.keys()
for i in range(nb_rows[0]):
    row_values = [lst[i] for lst in values_lst]
    row_dict = {column: value for column, value in zip(columns, row_values)}
    row = Row(**row_dict)
    row_lst.append(row)

df = sqlContext.createDataFrame(row_lst)
Using pault's answer above I imposed a specific schema on my dataframe as follows:
import pyspark
from pyspark.sql import SparkSession, functions
spark = SparkSession.builder.appName('dictToDF').getOrCreate()
get data:
dict_lst = {'letters': ['a', 'b', 'c'],'numbers': [10, 20, 30]}
data = dict_lst.values()
create schema:
from pyspark.sql.types import *

myschema = StructType([
    StructField("letters", StringType(), True),
    StructField("numbers", IntegerType(), True),
])
create df from dictionary - with schema:
df = spark.createDataFrame(zip(*data), schema=myschema)
df.show()
+-------+-------+
|letters|numbers|
+-------+-------+
| a| 10|
| b| 20|
| c| 30|
+-------+-------+
show df schema:
df.printSchema()
root
|-- letters: string (nullable = true)
|-- numbers: integer (nullable = true)
You can also use a Python list to quickly prototype a DataFrame. The idea is based on Databricks' tutorial.
df = spark.createDataFrame(
    [(1, "a"),
     (1, "a"),
     (1, "b")],
    ("id", "value"))
df.show()
+---+-----+
| id|value|
+---+-----+
| 1| a|
| 1| a|
| 1| b|
+---+-----+
Try this out:
dict_lst = [{'letters': 'a', 'numbers': 10},
            {'letters': 'b', 'numbers': 20},
            {'letters': 'c', 'numbers': 30}]
df_dict = sc.parallelize(dict_lst).toDF()  # Result as expected
Output:
>>> df_dict.show()
+-------+-------+
|letters|numbers|
+-------+-------+
| a| 10|
| b| 20|
| c| 30|
+-------+-------+
The most efficient approach is to use Pandas
import pandas as pd
spark.createDataFrame(pd.DataFrame(dict_lst))
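If you take the Pandas route often, enabling Arrow usually makes the conversion noticeably faster; the config name below is the Spark 3.x one (older releases use spark.sql.execution.arrow.enabled):
import pandas as pd

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
df = spark.createDataFrame(pd.DataFrame(dict_lst))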