Manually create a pyspark dataframe - pyspark

I am trying to manually create a pyspark dataframe given certain data:
row_in = [(1566429545575348), (40.353977), (-111.701859)]
rdd = sc.parallelize(row_in)
schema = StructType(
    [
        StructField("time_epocs", DecimalType(), True),
        StructField("lat", DecimalType(), True),
        StructField("long", DecimalType(), True),
    ]
)
df_in_test = spark.createDataFrame(rdd, schema)
This gives an error when I try to display the dataframe, so I am not sure how to do this.
However, the Spark documentation seems to be a bit convoluted to me, and I got similar errors when I tried to follow those instructions.
Does anyone know how to do this?
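For reference, a variant of the snippet above that does run (a sketch only: it assumes the intent is a single row with three columns, and it swaps DecimalType for types that match the raw Python values):
from pyspark.sql.types import StructType, StructField, LongType, DoubleType

# Note the single tuple: (40.353977) by itself is just a float, not a one-element
# tuple, so the original list held three scalars rather than three-field rows.
row_in = [(1566429545575348, 40.353977, -111.701859)]

schema = StructType(
    [
        StructField("time_epocs", LongType(), True),  # assumption: the epoch value fits a long
        StructField("lat", DoubleType(), True),       # assumption: double instead of DecimalType
        StructField("long", DoubleType(), True),
    ]
)

df_in_test = spark.createDataFrame(row_in, schema)
df_in_test.show()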

Simple dataframe creation:
df = spark.createDataFrame(
    [
        (1, "foo"),  # create your data here, be consistent in the types.
        (2, "bar"),
    ],
    ["id", "label"],  # add your column names here
)
df.printSchema()
root
|-- id: long (nullable = true)
|-- label: string (nullable = true)
df.show()
+---+-----+
| id|label|
+---+-----+
| 1| foo|
| 2| bar|
+---+-----+
According to the official docs:
when schema is a list of column names, the type of each column will be inferred from data. (example above ↑)
When schema is pyspark.sql.types.DataType or a datatype string, it must match the real data. (examples below ↓)
# Example with a datatype string
df = spark.createDataFrame(
    [
        (1, "foo"),  # Add your data here
        (2, "bar"),
    ],
    "id int, label string",  # add column names and types here
)

# Example with pyspark.sql.types
from pyspark.sql import types as T

df = spark.createDataFrame(
    [
        (1, "foo"),  # Add your data here
        (2, "bar"),
    ],
    T.StructType(  # Define the whole schema within a StructType
        [
            T.StructField("id", T.IntegerType(), True),
            T.StructField("label", T.StringType(), True),
        ]
    ),
)
df.printSchema()
root
|-- id: integer (nullable = true) # type is forced to Int
|-- label: string (nullable = true)
Additionally, you can create your dataframe from a Pandas dataframe; the schema will be inferred from the Pandas dataframe's types:
import pandas as pd
import numpy as np

pdf = pd.DataFrame(
    {
        "col1": [np.random.randint(10) for x in range(10)],
        "col2": [np.random.randint(100) for x in range(10)],
    }
)
df = spark.createDataFrame(pdf)
df.show()
+----+----+
|col1|col2|
+----+----+
| 6| 4|
| 1| 39|
| 7| 4|
| 7| 95|
| 6| 3|
| 7| 28|
| 2| 26|
| 0| 4|
| 4| 32|
+----+----+
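If you check the inferred schema, the numpy integer columns typically come through as longs (a rough sketch of what you would likely see):
df.printSchema()
# Likely output, since numpy int64 maps to Spark's long type:
# root
#  |-- col1: long (nullable = true)
#  |-- col2: long (nullable = true)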

To elaborate on / build off of @Steven's answer:
from pyspark.sql.types import StructType, StructField, FloatType, StringType

field = [
    StructField("MULTIPLIER", FloatType(), True),
    StructField("DESCRIPTION", StringType(), True),
]
schema = StructType(field)
multiplier_df = sqlContext.createDataFrame(sc.emptyRDD(), schema)
Will create a blank dataframe.
We can now simply add a row to it:
l = [(2.3, "this is a sample description")]
rdd = sc.parallelize(l)
multiplier_df_temp = spark.createDataFrame(rdd, schema)
multiplier_df = multiplier_df.union(multiplier_df_temp)
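Showing the unioned dataframe should then give something along these lines (a sketch of the expected output):
multiplier_df.show()
+----------+--------------------+
|MULTIPLIER|         DESCRIPTION|
+----------+--------------------+
|       2.3|this is a sample ...|
+----------+--------------------+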

This answer demonstrates how to create a PySpark DataFrame with createDataFrame, create_df and toDF.
df = spark.createDataFrame([("joe", 34), ("luisa", 22)], ["first_name", "age"])
df.show()
+----------+---+
|first_name|age|
+----------+---+
| joe| 34|
| luisa| 22|
+----------+---+
You can also pass createDataFrame an RDD and a schema to construct DataFrames with more precision:
from pyspark.sql import Row
from pyspark.sql.types import *

rdd = spark.sparkContext.parallelize([
    Row(name='Allie', age=2),
    Row(name='Sara', age=33),
    Row(name='Grace', age=31)])
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), False)])
df = spark.createDataFrame(rdd, schema)
df.show()
+-----+---+
| name|age|
+-----+---+
|Allie| 2|
| Sara| 33|
|Grace| 31|
+-----+---+
create_df from my Quinn project allows for the best of both worlds - it's concise and fully descriptive:
from pyspark.sql.types import *
from quinn.extensions import *
df = spark.create_df(
    [("jose", "a"), ("li", "b"), ("sam", "c")],
    [("name", StringType(), True), ("blah", StringType(), True)]
)
df.show()
+----+----+
|name|blah|
+----+----+
|jose| a|
| li| b|
| sam| c|
+----+----+
toDF doesn't offer any advantages over the other approaches:
from pyspark.sql import Row
rdd = spark.sparkContext.parallelize([
    Row(name='Allie', age=2),
    Row(name='Sara', age=33),
    Row(name='Grace', age=31)])
df = rdd.toDF()
df.show()
+-----+---+
| name|age|
+-----+---+
|Allie| 2|
| Sara| 33|
|Grace| 31|
+-----+---+
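If the RDD holds plain tuples instead of Row objects, toDF can still take the column names directly (a small sketch, not from the original answer):
rdd = spark.sparkContext.parallelize([("Allie", 2), ("Sara", 33), ("Grace", 31)])
df = rdd.toDF(["name", "age"])  # column names supplied, types inferred
df.show()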

With formatting
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [
        (1, "foo"),
        (2, "bar"),
    ],
    StructType(
        [
            StructField("id", IntegerType(), False),
            StructField("txt", StringType(), False),
        ]
    ),
)
print(df.dtypes)
df.show()
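The explicit IntegerType shows up in the dtypes, in contrast to the bigint you would get from an inferred schema (expected output, roughly):
[('id', 'int'), ('txt', 'string')]
+---+---+
| id|txt|
+---+---+
|  1|foo|
|  2|bar|
+---+---+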

Extending @Steven's answer:
data = [(i, 'foo') for i in range(1000)]  # random data
columns = ['id', 'txt']  # add your column labels here
df = spark.createDataFrame(data, columns)
Note: When the schema is a list of column names, the type of each column will be inferred from the data.
If you want to define the schema explicitly, do this:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
schema = StructType([StructField("id", IntegerType(), True), StructField("txt", StringType(), True)])
df1 = spark.createDataFrame(data, schema)
Outputs:
>>> df1
DataFrame[id: int, txt: string]
>>> df
DataFrame[id: bigint, txt: string]

For beginners, a full example importing data from a file:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    ShortType,
    StringType,
    StructType,
    StructField,
    TimestampType,
)
import os

here = os.path.abspath(os.path.dirname(__file__))

spark = SparkSession.builder.getOrCreate()
schema = StructType(
    [
        StructField("id", ShortType(), nullable=False),
        StructField("string", StringType(), nullable=False),
        StructField("datetime", TimestampType(), nullable=False),
    ]
)

# read file or construct rows manually
df = spark.read.csv(os.path.join(here, "data.csv"), schema=schema, header=True)
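If you don't have a data.csv yet, a tiny file can be written from the same script for testing (purely a hypothetical sample; whether the timestamps parse depends on your Spark version's default timestampFormat):
sample = (
    "id,string,datetime\n"
    "1,foo,2021-01-01T00:00:00\n"
    "2,bar,2021-01-02T12:30:00\n"
)
with open(os.path.join(here, "data.csv"), "w") as f:
    f.write(sample)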

Related

Pyspark - How do I flatten a nested struct column preserving the parent name

If I have a dataframe with a struct column named structA, and in it we have 3 columns named a, b and c,
then if I want to flatten the struct I can easily do that with df.select("structA.*") and it will display

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  3|  5|  6|
+---+---+---+
What I wanted is
+---------+---------+---------+
|structA.a|structA.b|structA.c|
+---------+---------+---------+
|        1|        2|        3|
|        3|        5|        6|
+---------+---------+---------+
How can I do this?
I'm afraid it's not as straightforward as it should be. You'll need to loop through the schema to build your desired column names, then rename the columns in bulk. Something like this:
Sample dataset
df = spark.createDataFrame([
    ((1, 2, 3),),
    ((4, 5, 6),),
], 'structA struct<a:int, b:int, c:int>')

df.show()
df.printSchema()
+---------+
| structA|
+---------+
|{1, 2, 3}|
|{4, 5, 6}|
+---------+
root
 |-- structA: struct (nullable = true)
 |    |-- a: integer (nullable = true)
 |    |-- b: integer (nullable = true)
 |    |-- c: integer (nullable = true)
from pyspark.sql import functions as F

struct_col = 'structA'
struct_cols = [
    [F.col(b.name).alias(f'{a.name}_{b.name}') for b in a.dataType.fields]
    for a in df.schema if a.name == struct_col
][0]
# [Column<'a AS structA_a'>, Column<'b AS structA_b'>, Column<'c AS structA_c'>]

df.select(f'{struct_col}.*').select(struct_cols).show()
+---------+---------+---------+
|structA_a|structA_b|structA_c|
+---------+---------+---------+
| 1| 2| 3|
| 4| 5| 6|
+---------+---------+---------+
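Equivalently (a compact sketch along the same lines), you can reference the nested fields by their fully qualified names and alias them in a single select, reading the field names from the schema at runtime:
struct_cols = [
    F.col(f'{struct_col}.{f.name}').alias(f'{struct_col}_{f.name}')
    for f in df.schema[struct_col].dataType.fields
]
df.select(struct_cols).show()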
I did the following in order to do this:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# create dataframe ----
df_data = [((1, 2, 3), (-1, -2, -3)), ((4, 5, 6), (-4, -5, -6))]
structureSchema = StructType([
    StructField('structA', StructType([
        StructField('a', IntegerType(), True),
        StructField('b', IntegerType(), True),
        StructField('c', IntegerType(), True)
    ])),
    StructField('structB', StructType([
        StructField('a', IntegerType(), True),
        StructField('b', IntegerType(), True),
        StructField('c', IntegerType(), True)
    ]))
])
df = spark.createDataFrame(df_data, structureSchema)
df.show()
+---------+------------+
| structA| structB|
+---------+------------+
|[1, 2, 3]|[-1, -2, -3]|
|[4, 5, 6]|[-4, -5, -6]|
+---------+------------+
If we have multiple struct-like columns, we need to find them like this:
nested_cols = [c[0] for c in df.dtypes if c[1][:6] == 'struct']
nested_cols
['structA', 'structB']
Now I will create a JSON-like object (a dict) with the new structure:
struct_columns = {}
for struct_column in nested_cols:
    struct_columns[struct_column] = df.select(struct_column + ".*").columns

struct_columns
{'structA': ['a', 'b', 'c'], 'structB': ['a', 'b', 'c']}
With this structure, I will create the flattened dataframe:
flatten_df = df
for key in struct_columns:
    print(key)
    for column in struct_columns[key]:
        flatten_df = flatten_df.withColumn(key + "_" + column, F.expr(f"{key}.{column}"))

flatten_df.drop(*df.columns).show()
+---------+---------+---------+---------+---------+---------+
|structA_a|structA_b|structA_c|structB_a|structB_b|structB_c|
+---------+---------+---------+---------+---------+---------+
| 1| 2| 3| -1| -2| -3|
| 4| 5| 6| -4| -5| -6|
+---------+---------+---------+---------+---------+---------+
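As a more compact variant (a sketch that builds on the struct_columns dict above rather than the answer's exact code), the same flattening can be done with a single select instead of repeated withColumn calls:
flat_cols = [
    F.col(f"{key}.{column}").alias(f"{key}_{column}")
    for key in struct_columns
    for column in struct_columns[key]
]
df.select(flat_cols).show()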

How can I convert a nested json to a specific dataframe

I have a json which looks like this:
"A":{"B":1,"C":[{"D":2,"E":3},{"D":6,"E":7}]}
I would like to create a dataframe based on this with two rows.
+---+---+---+
|  B|  D|  E|
+---+---+---+
|  1|  2|  3|
|  1|  6|  7|
+---+---+---+
In an imperative language, I would use a for loop. However, I saw that this is not recommended with Scala and therefore I am wondering how I can explode this inner JSON and use the entries in new rows.
Thank you very much
You can use from_json to parse the JSON string values:
val df = Seq(
  """"A":{"B":1,"C":[{"D":2,"E":3},{"D":6,"E":7}]}"""
).toDF("col")

val df1 = df.withColumn(
  "col",
  from_json(
    regexp_extract(col("col"), "\"A\":(.*)", 1), // extract the part after "A":
    lit("struct<B:int,C:array<struct<D:int,E:int>>>")
  )
).select(col("col.B"), expr("inline(col.C)"))

df1.show
//+---+---+---+
//| B| D| E|
//+---+---+---+
//| 1| 2| 3|
//| 1| 6| 7|
//+---+---+---+
You can also pass the schema as a StructType to the from_json function, which you can define like this:
val schema = StructType(Array(
  StructField("B", IntegerType, true),
  StructField("C", ArrayType(StructType(Array(
    StructField("D", IntegerType, true),
    StructField("E", IntegerType, true)
  ))))
))

How to create a pyspark DataFrame inside of a loop?

How do I create a pyspark DataFrame inside of a loop? In this loop, in each iteration, I am printing two values: print(a1, a2). Now I want to store all these values in a pyspark dataframe.
Initially, before the loop, you could create an empty dataframe with your preferred schema. Then, create a new df for each loop iteration with the same schema and union it with your original dataframe. Refer to the code below.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.getOrCreate()

schema = StructType([
    StructField('a1', StringType(), True),
    StructField('a2', StringType(), True)
])
df = spark.createDataFrame([], schema)

for i in range(1, 5):
    a1 = i
    a2 = i + 1
    newRow = spark.createDataFrame([(a1, a2)], schema)
    df = df.union(newRow)

df.show()
This gives me the below result where the values are appended to the df in each loop.
+---+---+
| a1| a2|
+---+---+
| 1| 2|
| 2| 3|
| 3| 4|
| 4| 5|
+---+---+
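As a side note, unioning inside a loop grows the query plan with every iteration; for anything beyond a handful of rows it is usually simpler to collect the values in a plain Python list first and call createDataFrame once (a sketch, not part of the original answer):
rows = []
for i in range(1, 5):
    rows.append((i, i + 1))  # accumulate plain Python tuples

df = spark.createDataFrame(rows, schema)
df.show()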

create empty array-column of given schema in Spark

Because Parquet cannot persist empty arrays, I replaced empty arrays with null before writing a table. Now, as I read the table, I want to do the opposite:
I have a DataFrame with the following schema:
 |-- id: long (nullable = false)
 |-- arr: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- x: double (nullable = true)
 |    |    |-- y: double (nullable = true)
and the following content:
+---+-----------+
| id| arr|
+---+-----------+
| 1|[[1.0,2.0]]|
| 2| null|
+---+-----------+
I'd like to replace the null-array (id=2) with an empty array, i.e.
+---+-----------+
| id| arr|
+---+-----------+
| 1|[[1.0,2.0]]|
| 2| []|
+---+-----------+
I've tried:
val arrSchema = df.schema(1).dataType

df
  .withColumn("arr", when($"arr".isNull, array().cast(arrSchema)).otherwise($"arr"))
  .show()
which gives :
java.lang.ClassCastException: org.apache.spark.sql.types.NullType$
cannot be cast to org.apache.spark.sql.types.StructType
Edit : I don't want to "hardcode" any schema of my array column (at least not the schema of the struct) because this can vary from case to case. I can only use the schema information from df at runtime
I'm using Spark 2.1 by the way, therefore I cannot use typedLit
Spark 2.2+ with known external type
In general you can use typedLit to provide empty arrays.
import org.apache.spark.sql.functions.typedLit
typedLit(Seq.empty[(Double, Double)])
To use specific names for nested objects you can use case classes:
case class Item(x: Double, y: Double)
typedLit(Seq.empty[Item])
or rename by cast:
typedLit(Seq.empty[(Double, Double)])
.cast("array<struct<x: Double, y: Double>>")
Spark 2.1+ with schema only
With schema only you can try:
val schema = StructType(Seq(
  StructField("arr", StructType(Seq(
    StructField("x", DoubleType),
    StructField("y", DoubleType)
  )))
))

def arrayOfSchema(schema: StructType) =
  from_json(lit("""{"arr": []}"""), schema)("arr")

arrayOfSchema(schema).alias("arr")
where schema can be extracted from the existing DataFrame and wrapped with additional StructType:
StructType(Seq(
  StructField("arr", df.schema("arr").dataType)
))
One way is to use a UDF:
val arrSchema = df.schema(1).dataType // ArrayType(StructType(StructField(x,DoubleType,true), StructField(y,DoubleType,true)),true)
val emptyArr = udf(() => Seq.empty[Any], arrSchema)

df
  .withColumn("arr", when($"arr".isNull, emptyArr()).otherwise($"arr"))
  .show()
+---+-----------+
| id| arr|
+---+-----------+
| 1|[[1.0,2.0]]|
| 2| []|
+---+-----------+
Another approach would be to use coalesce:
val df = Seq(
  (Some(1), Some(Array((1.0, 2.0)))),
  (Some(2), None)
).toDF("id", "arr")

df.withColumn("arr", coalesce($"arr", typedLit(Array.empty[(Double, Double)]))).show
// +---+-----------+
// | id| arr|
// +---+-----------+
// | 1|[[1.0,2.0]]|
// | 2| []|
// +---+-----------+
UDF with case class could also be interesting:
case class Item(x: Double, y: Double)

val udf_emptyArr = udf(() => Seq[Item]())

df
  .withColumn("arr", coalesce($"arr", udf_emptyArr()))
  .show()

Join unfolded mapType column in Spark back with the original dataframe

I have a dataframe in (py)Spark, where one of the columns is of the type 'map'. I want to flatten or split that column into multiple columns which should be added to the original dataframe. I'm able to unfold the column with flatMap, however I lose the key needed to join the new dataframe (from the unfolded column) with the original dataframe.
My schema is like this:
root
 |-- key: string (nullable = true)
 |-- metric: map (nullable = false)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)
As you can see, the column 'metric' is a map-field. This is the column that I want to flatten. Before flattening it looks like:
+----+---------------------------------------------------+
|key |metric |
+----+---------------------------------------------------+
|123k|Map(metric1 -> 1.3, metric2 -> 6.3, metric3 -> 7.6)|
|d23d|Map(metric1 -> 1.5, metric2 -> 2.0, metric3 -> 2.2)|
|as3d|Map(metric1 -> 2.2, metric2 -> 4.3, metric3 -> 9.0)|
+----+---------------------------------------------------+
To convert that field to columns I do
df2.select('metric').rdd.flatMap(lambda x: x).toDF().show()
which gives
+------------------+-----------------+-----------------+
| metric1| metric2| metric3|
+------------------+-----------------+-----------------+
|1.2999999523162842|6.300000190734863|7.599999904632568|
| 1.5| 2.0|2.200000047683716|
| 2.200000047683716|4.300000190734863| 9.0|
+------------------+-----------------+-----------------+
However, I don't see the key, therefore I don't know how to add this data to the original dataframe.
What I want is:
+----+-------+-------+-------+
| key|metric1|metric2|metric3|
+----+-------+-------+-------+
|123k| 1.3| 6.3| 7.6|
|d23d| 1.5| 2.0| 2.2|
|as3d| 2.2| 4.3| 9.0|
+----+-------+-------+-------+
My question thus is: how can I get df2 back to df (given that I originally don't know df and only have df2)?
To make df2:
rdd = sc.parallelize([('123k', 1.3, 6.3, 7.6),
                      ('d23d', 1.5, 2.0, 2.2),
                      ('as3d', 2.2, 4.3, 9.0)
                      ])
schema = StructType([StructField('key', StringType(), True),
                     StructField('metric1', FloatType(), True),
                     StructField('metric2', FloatType(), True),
                     StructField('metric3', FloatType(), True)])
df = sqlContext.createDataFrame(rdd, schema)

from pyspark.sql.functions import lit, col, create_map
from itertools import chain

metric = create_map(list(chain(*(
    (lit(name), col(name)) for name in df.columns if "metric" in name
)))).alias("metric")

df2 = df.select("key", metric)
from pyspark.sql.functions import explode
# fetch column names of the original dataframe from keys of MapType 'metric' column
col_names = df2.select(explode("metric")).select("key").distinct().sort("key").rdd.flatMap(lambda x: x).collect()
exprs = [col("key")] + [col("metric").getItem(k).alias(k) for k in col_names]
df2_to_original_df = df2.select(*exprs)
df2_to_original_df.show()
Output is:
+----+-------+-------+-------+
| key|metric1|metric2|metric3|
+----+-------+-------+-------+
|123k| 1.3| 6.3| 7.6|
|d23d| 1.5| 2.0| 2.2|
|as3d| 2.2| 4.3| 9.0|
+----+-------+-------+-------+
I can select a certain key from a maptype by doing:
df.select('maptypecolumn.key')
In my example I did it as follows:
columns = df2.select('metric').rdd.flatMap(lambda x: x).toDF().columns
for i in columns:
    df2 = df2.withColumn(i, lit(df2.metric[i]))
You can access key and value for example like this:
from pyspark.sql.functions import explode
df.select(explode("custom_dimensions")).select("key")
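Along the same lines, here is a sketch (assuming the df2 from the question, with its 'key' column and 'metric' map) that goes from the exploded key/value pairs back to one column per map key by pivoting:
from pyspark.sql.functions import explode

wide = (
    df2
    .select("key", explode("metric").alias("metric_name", "metric_value"))
    .groupBy("key")
    .pivot("metric_name")
    .sum("metric_value")
)
wide.show()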