How to unpivot pyspark dataframe? - pyspark

How can I transform the first dataframe into the second dataframe shown in the image with PySpark? I did some research and I am looking for something like an unpivot function. The column "Ano" is the year.
At the moment I have this code, based on this link: Unpivot in Spark SQL / PySpark:
from pyspark.sql.types import StructType,StructField,StringType,FloatType
from pyspark.sql.functions import when
data = [("High Paper Grade", 0.007,0.005,0.001,0.026,0.013,0.009,0.001,0.059,"Paper"), ("Mixed Low Grade Paper", 0.087,0.068,0.024,0.314,0.093,0.074,0.024,0.319,"Paper")]
schema = StructType([ \
StructField("Material",StringType(),True), \
StructField("2017_Aggregate",FloatType(),True), \
StructField("2017_Refuse",FloatType(),True), \
StructField("2017_MGP", FloatType(), True), \
StructField("2017_Paper", FloatType(), True), \
StructField("2013_Aggregate", FloatType(), True), \
StructField("2013_Refuse", FloatType(), True), \
StructField("2013_MGP", FloatType(), True), \
StructField("2013_Paper", FloatType(), True), \
StructField("Material_Group", StringType(), True)
])
df = spark.createDataFrame(data=data,schema=schema)
df = df.selectExpr("Material", "Material_Group", "stack(8, '2017_Aggregate', `2017_Aggregate`, '2017_Refuse', `2017_Refuse`, '2017_MGP', `2017_MGP`, '2017_Paper', `2017_Paper`, '2013_Aggregate', `2013_Aggregate`, '2013_Refuse', `2013_Refuse`, '2013_MGP', `2013_MGP`, '2013_Paper', `2013_Paper`) as (Year, Value)").where("Value is not null")
df.show()
df = df.withColumn("Year", when(df.Year == "2017_Aggregate", "2017")
.when (df.Year == "2017_Aggregate", "2017")
.when (df.Year == "2017_Refuse", "2017")
.when (df.Year == "2017_MGP", "2017")
.when (df.Year == "2017_Paper", "2017")
.when (df.Year == "2013_Aggregate", "2013")
.when (df.Year == "2013_Refuse", "2013")
.when (df.Year == "2013_MGP", "2013")
.when (df.Year == "2013_Paper", "2013")
.otherwise(df.Year)
)
df.toPandas()
Which gives me the following output:
Output
This is what I am trying to achieve:
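As a side note, the long when() chain above can be replaced by splitting the stacked column name on "_" (a sketch applied right after the stack() step; it assumes every stacked name has the form <year>_<category>):
from pyspark.sql.functions import split
# Take the part before the first "_" as the year; the part after it is the
# original category, in case that is needed as a separate column as well
df = df.withColumn("Year", split(df["Year"], "_").getItem(0))
df.show()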

Related

How to read decimal numbers in csv file with spark?

I am trying to read a file that contains numbers with decimals, and when I read the CSV file with Spark I get null for some columns and only a few digits for other columns. I guess it has to do with the options I set during spark.read. Here is my code:
from pyspark.sql.types import DateType, DecimalType, StringType, StructField, StructType
schema = StructType([
StructField("Date", DateType(), False),
StructField("Total MV", DecimalType(16,5), False),
StructField("Total TWR", DecimalType(16,5), False),
StructField("Prod1 MV", DecimalType(16,5), False),
StructField("Prod1TWR", DecimalType(16,5), False),
StructField("Prod2 MV", DecimalType(16,5), False),
StructField("Prod2TWR", DecimalType(16,5), False),
StructField("StockAll", DecimalType(16,5), False)
])
df_mr = (spark.read
    .option("delimiter", ";")
    .option("inferSchema", True)
    .csv("here is the link of the file", locale="sv_SE"))
df_mr.schema
df = (
spark.read
.option("delimiter", ";")
.schema(schema)
.csv("here is the link to the file", locale="sv_SE")
)
df.createOrReplaceTempView("output")
df.show()
The output I get is the following output, and then when I use SQL
%sql
select * from output
to get the table, I get the following SQL table. I don't understand why I get nulls and number formats that differ from the first image. The sample input data is indata.
I tried with your sample data.
DecimalType(20, 5) allows 20 digits of precision in total (15 before the decimal point and 5 after), and adding .option("header", True) treats the first line as the header, which removes the null values that were being created.
from pyspark.sql.types import DateType, DecimalType, DecimalType, StringType, StructField, StructType
from pyspark.sql.functions import col, unix_timestamp, to_date
from pyspark.sql import functions as F
schema = StructType([
StructField("Date", DateType(), False),
StructField("Total MV", DecimalType(20,5), False),
StructField("Total TWR", DecimalType(20,5), False),
StructField("Prod1 MV", DecimalType(20,5), False),
StructField("Prod1TWR", DecimalType(20,5), False),
StructField("Prod2 MV", DecimalType(20,5), False),
StructField("Prod2TWR", DecimalType(20,5), False),
StructField("StockAll", DecimalType(20,5), False)
])
df_mr = (spark.read
.option("delimiter", ";")
.option("header", True)
.option("inferSchema", True)
.csv("/FileStore/tables/samplecsvdatadeci.txt"))
df_mr.printSchema()
df_mr.show()
df = (
spark.read
.option("delimiter", ";")
.option("header", True)
.schema(schema)
.csv("/FileStore/tables/samplecsvdatadeci.txt")
)
df.printSchema()
df.show()
df.createOrReplaceTempView("output")
%sql
select * from output
Output
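To make the precision/scale semantics explicit (a small illustration, not part of the original answer): in DecimalType(precision, scale), the first argument bounds the total number of digits and the second the digits after the decimal point, so DecimalType(20, 5) leaves room for 15 integer digits.
from decimal import Decimal
from pyspark.sql.types import StructType, StructField, DecimalType
# 15 integer digits + 5 fractional digits = 20 digits, so this fits DecimalType(20, 5)
wide = StructType([StructField("v", DecimalType(20, 5), True)])
spark.createDataFrame([(Decimal("123456789012345.12345"),)], wide).show(truncate=False)
# The same value would not fit DecimalType(16, 5), which only allows 11 integer digits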

Create a column for each struct in an array of struct's in a PySpark DataFrame

Suppose I have a dataframe where I have an id and a distinct list of keys and values, such as the following:
import pyspark.sql.functions as fun
from pyspark.sql.types import StructType, StructField, ArrayType, IntegerType, StringType
schema = StructType(
[
StructField('id', StringType(), True),
StructField('columns',
ArrayType(
StructType([
StructField('key', StringType(), True),
StructField('value', IntegerType(), True)
])
)
)
]
)
data = [
('1', [('Savas', 5)]),
('2', [('Savas', 5), ('Ali', 3)]),
('3', [('Savas', 5), ('Ali', 3), ('Ozdemir', 7)])
]
df = spark.createDataFrame(data, schema)
df.show()
For each struct in the array-type column I want to create a column, as follows:
df1 = df\
    .withColumn('temp', fun.explode('columns'))\
    .select('id', 'temp.key', 'temp.value')\
    .groupby('id')\
    .pivot('key')\
    .agg(fun.first(fun.col('value')))\
    .sort('id')
df1.show()
Is there a more efficient way to achieve the same result?
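One alternative worth sketching (not from the original post; it needs Spark 2.4+ for map_from_entries, and the key list is assumed to be known up front, e.g. collected once beforehand) is to turn the array of structs into a map and select each key directly, which avoids the explode/groupby/pivot shuffle:
# Turn the array<struct<key, value>> column into a map and look up each key
df2 = df.withColumn('kv', fun.map_from_entries('columns'))
known_keys = ['Savas', 'Ali', 'Ozdemir']  # hypothetical: assumed known in advance
df2 = df2.select('id', *[fun.col('kv')[k].alias(k) for k in known_keys]).sort('id')
df2.show()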

Extract field(s) and value from nested json in PySpark dataframe and Sort it based off of value

How can I write a function in Databricks/Spark that takes an email (or the md5 value of the email) and "Mon" as input and returns the top 5 cities sorted by activityCount in dict format (if there aren't that many cities, return however many matches are found)?
PS: there are more columns in the df for the other days as well, such as "Tue", "Wed", "Thu", "Fri", "Sat" and "Sun", with data in a similar format, but I've only added "Mon" for brevity.
Dataframe
+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|email |Mon |
+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|aaaa#aol.com|{[California]={"[San Francisco]":{"activityCount":4}}, {"[San Diego]":{"activityCount":5}}, {"[San Jose]":{"activityCount":6}}, [New York]={"[New York City]":{"activityCount":1}}, {"[Fairport]":{"activityCount":2}}, {"[Manhattan]":{"activityCount":3}}}|
|bbbb#aol.com|{[Alberta]={"[city1]":{"activityCount":1}}, {"[city2]":{"activityCount":2}}, {"[city3]":{"activityCount":3}}, [New York]={"[New York City]":{"activityCount":7}}, {"[Fairport]":{"activityCount":8}}, {"[Manhattan]":{"activityCount":9}}}|
+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
The dataframe schema is as follows:
schema = StructType([
StructField("email", StringType(), True),
StructField("Mon", StringType(), False)
])
Sample code to set it up
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType
if __name__ == "__main__":
spark = SparkSession.builder.master("local[1]") \
.appName('SparkByExamples.com') \
.getOrCreate()
data2 = [("aaaa#aol.com",
{
"[New York]": "{\"[New York City]\":{\"activityCount\":1}}, {\"[Fairport]\":{\"activityCount\":2}}, "
"{\"[Manhattan]\":{\"activityCount\":3}}",
"[California]": "{\"[San Francisco]\":{\"activityCount\":4}}, {\"[San Diego]\":{\"activityCount\":5}}, "
"{\"[San Jose]\":{\"activityCount\":6}}"
}
)]
schema = StructType([
StructField("email", StringType(), True),
StructField("Mon", StringType(), False)
])
task5DF = spark.createDataFrame(data=data2, schema=schema)
task5DF.show(truncate=False)

Apache pyspark pandas

I am new to Apache Spark. I created the schema and the dataframe, and it shows me the result, but the format is not good and it is so messy that I can hardly read the lines. So I want to show my result in pandas format. I have attached a screenshot of my dataframe result, but I don't know how to show my result in pandas format.
Here's my code
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.types import *
from IPython.display import display
import pandas as pd
import gzip
schema = StructType([StructField("crimeid", StringType(), True),
StructField("Month", StringType(), True),
StructField("Reported_by", StringType(), True),
StructField("Falls_within", StringType(), True),
StructField("Longitude", FloatType(), True),
StructField("Latitue", FloatType(), True),
StructField("Location", StringType(), True),
StructField("LSOA_code", StringType(), True),
StructField("LSOA_name", StringType(), True),
StructField("Crime_type", StringType(), True),
StructField("Outcome_type", StringType(), True),
])
df = spark.read.csv("crimes.gz",header=False,schema=schema)
df.printSchema()
PATH = "crimes.gz"
csvfile = spark.read.format("csv")\
.option("header", "false")\
.schema(schema)\
.load(PATH)
df1 = csvfile
df1.show()
It shows the result like below,
but I want this data in pandas form.
Thanks
You can try showing the rows vertically, or truncating long values if you like:
df.show(2, vertical=True)
df.show(2, truncate=4, vertical=True)
Please try:
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.types import *
from IPython.display import display
import pandas as pd
import gzip
schema = StructType([StructField("crimeid", StringType(), True),
StructField("Month", StringType(), True),
StructField("Reported_by", StringType(), True),
StructField("Falls_within", StringType(), True),
StructField("Longitude", FloatType(), True),
StructField("Latitue", FloatType(), True),
StructField("Location", StringType(), True),
StructField("LSOA_code", StringType(), True),
StructField("LSOA_name", StringType(), True),
StructField("Crime_type", StringType(), True),
StructField("Outcome_type", StringType(), True),
])
df = spark.read.csv("crimes.gz",header=False,schema=schema)
df.printSchema()
pandasDF = df.toPandas()  # convert the PySpark dataframe into a pandas dataframe
print(pandasDF.head())  # print the first 5 rows
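One caveat to add here (not in the original answer): toPandas() collects the entire DataFrame onto the driver, so for a large crimes file it is safer to limit the rows first, for example:
pandas_sample = df.limit(20).toPandas()  # convert only a small slice to pandas
display(pandas_sample)  # display() comes from IPython.display, imported above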

Pyspark on Spark 2.1.1, StructFields in StructType are always nullable

I'm creating a StructType using several StructFields -- the name and datatype seem to work fine, but regardless of setting nullable to False in each StructField the resulting schema reports nullable is True for each StructField.
Can anyone explain why? THANKS!
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, FloatType, TimestampType
sparkSession = SparkSession.builder \
.master("local") \
.appName("SparkSession") \
.getOrCreate()
dfStruct = StructType().add("date", TimestampType(), False)
dfStruct.add("open", FloatType(), False)
dfStruct.add("high", FloatType(), False)
dfStruct.add("low", FloatType(), False)
dfStruct.add("close", FloatType(), False)
dfStruct.add("ticker", StringType(), False)
# print elements of the StructType -- reports nullable as False
for d in dfStruct: print(d)
#data looks like this:
#date,open,high,low,close,ticker
# 2014-10-14 23:20:32,7.14,9.07,0.0,7.11,ARAY
# 2014-10-14 23:20:36,9.74,10.72,6.38,9.25,ARC
# 2014-10-14 23:20:38,31.38,37.0,28.0,30.94,ARCB
# 2014-10-14 23:20:44,15.39,17.37,15.35,15.3,ARCC
# 2014-10-14 23:20:49,5.59,6.5,5.31,5.48,ARCO
#read csv file and apply dfStruct as the schema
df = sparkSession.read.csv(path = "/<path tofile>/stock_data.csv", \
schema = dfStruct, \
sep = ",", \
ignoreLeadingWhiteSpace = True, \
ignoreTrailingWhiteSpace = True \
)
#reports nullable as True!
df.printSchema()
This is a known issue in Spark. There is currently an open pull request in Spark aiming to fix this issue. If you really need your fields to be non-nullable, try:
#read csv file and apply dfStruct as the schema
df = sparkSession.read.csv(path = "/<path tofile>/stock_data.csv", \
schema = dfStruct, \
sep = ",", \
ignoreLeadingWhiteSpace = True, \
ignoreTrailingWhiteSpace = True \
).rdd.toDF(dfStruct)
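An equivalent formulation of the same workaround (a sketch, not from the original answer) is to rebuild the DataFrame from its RDD while re-applying the schema; an explicitly supplied schema is kept as-is, including nullable = False:
# Round-trip through the RDD and re-apply the strict schema
df_strict = sparkSession.createDataFrame(df.rdd, schema=dfStruct)
df_strict.printSchema()  # now reports nullable = false for each field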