How to complement a df with values from another df - pyspark

I have 2 dataframes: one df has unique, well-formatted values, and the other df has wrong values. How can I complement the df with wrong values using the other dataframe?
Example: df with correct and unique values
+----------------------------------------+--------------+
|company_id |company_name |
+----------------------------------------+--------------+
|8f642dc67fccf861548dfe1c761ce22f795e91f0|Muebles |
|cbf1c8b09cd5b549416d49d220a40cbd317f952e|MiPasajefy |
+----------------------------------------+--------------+
Example df with wrong values:
+----------------------------------------+------------+
|company_id |company_name|
+----------------------------------------+------------+
|******* |MiPasajefy |
|cbf1c8b09cd5b549416d49d220a40cbd317f952e|NaN |
|NaN |MiPasajefy |
+----------------------------------------+------------+
The columns company_id and company_name are the key columns,
so the df with wrong values, once corrected, has to be:
+----------------------------------------+------------+
|company_id |company_name|
+----------------------------------------+------------+
|cbf1c8b09cd5b549416d49d220a40cbd317f952e|MiPasajefy |
|cbf1c8b09cd5b549416d49d220a40cbd317f952e|MiPasajefy |
|cbf1c8b09cd5b549416d49d220a40cbd317f952e|MiPasajefy |
+----------------------------------------+------------+

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [
        ("8f642dc67fccf861548dfe1c761ce22f795e91f0", "Muebles"),
        ("cbf1c8b09cd5b549416d49d220a40cbd317f952e", "MiPasajefy"),
    ],
    ("company_id", "company_name"),
)
df2 = spark.createDataFrame(
    [
        ("*****", "MiPasajefy"),
        ("cbf1c8b09cd5b549416d49d220a40cbd317f952e", "NaN"),
        ("NaN", "MiPasajefy"),
    ],
    ("company_id", "company_name"),
)
df.createOrReplaceTempView("A")
df2.createOrReplaceTempView("B")
spark.sql("""
    SELECT a.Company_name, a.company_id
    FROM B b
    LEFT JOIN A a
      ON (a.company_id = b.company_id OR a.Company_name = b.Company_name)
""").show(truncate=False)
+------------+----------------------------------------+
|Company_name|company_id |
+------------+----------------------------------------+
|MiPasajefy |cbf1c8b09cd5b549416d49d220a40cbd317f952e|
|MiPasajefy |cbf1c8b09cd5b549416d49d220a40cbd317f952e|
|MiPasajefy |cbf1c8b09cd5b549416d49d220a40cbd317f952e|
+------------+----------------------------------------+
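For reference, the same fix can be sketched with the DataFrame API instead of SQL (built on the df/df2 defined above; only a sketch, adjust column handling to your data):
from pyspark.sql import functions as F

df2.alias("b").join(
    df.alias("a"),
    (F.col("a.company_id") == F.col("b.company_id"))
    | (F.col("a.company_name") == F.col("b.company_name")),
    "left",
).select("a.company_id", "a.company_name").show(truncate=False)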

Related

Create a column for each struct in an array of struct's in a PySpark DataFrame

Suppose I have a dataframe where I have an id and a distinct list of keys and values, such as the following:
import pyspark.sql.functions as fun
from pyspark.sql.types import StructType, StructField, ArrayType, IntegerType, StringType
schema = StructType(
    [
        StructField('id', StringType(), True),
        StructField('columns',
            ArrayType(
                StructType([
                    StructField('key', StringType(), True),
                    StructField('value', IntegerType(), True)
                ])
            )
        )
    ]
)
data = [
('1', [('Savas', 5)]),
('2', [('Savas', 5), ('Ali', 3)]),
('3', [('Savas', 5), ('Ali', 3), ('Ozdemir', 7)])
]
df = spark.createDataFrame(data, schema)
df.show()
For each struct in the array type column I want create a column, as follows:
df1 = df\
    .withColumn('temp', fun.explode('columns'))\
    .select('id', 'temp.key', 'temp.value')\
    .groupby('id')\
    .pivot('key')\
    .agg(fun.first(fun.col('value')))\
    .sort('id')
df1.show()
Is there a more efficient way to achieve the same result?
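One alternative worth sketching (assuming Spark 2.4+ for map_from_entries and a key set small enough to collect): convert the array of structs to a map and project one column per key, which avoids the pivot. Whether it is actually faster depends on your data.
# collect the distinct keys once, then build one column per key
keys = [r[0] for r in df.select(fun.explode('columns.key')).distinct().collect()]
# 'df_wide' is just an illustrative name for the result
df_wide = df.select('id', fun.map_from_entries('columns').alias('m')) \
    .select('id', *[fun.col('m').getItem(k).alias(k) for k in keys])
df_wide.show()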

How to extract data from a column which has json type strings in pyspark?

Hi, I have a dataframe:
+---------+-----------------------------------------------------+
|client_id|event_metadata                                       |
+---------+-----------------------------------------------------+
|18890    |{Scripname:"DELL", Exchange: "NSE", Segment: "EQ" } |
|10531    |{Scripname:"NAUKRI", Exchange: "NSE", Segment: "EQ" }|
+---------+-----------------------------------------------------+
I want to extract event_metadata and store only Scripname along with client_id as a dataframe.
event_metadata is a string, not valid JSON.
I have tried
from pyspark.sql import functions as F
df1.select('client_id', F.json_tuple('event_metadata', 'Scripname',
           'Exchange', 'Segment').alias('Scripname', 'Exchange', 'Segment')).show()
It's returning null values.
I have also tried a regex, but it throws an error:
from pyspark.sql.functions import regexp_extract
df1.withColumn("event_metadata", regexp_extract("event_metadata", "(?<=Scripname: )\w+(?=(,|}))", 0))\
    .show(truncate=False)
Desired Output:
+---------+---------+
|client_id|Scripname|
+---------+---------+
|18890    |DELL     |
|10531    |NAUKRI   |
+---------+---------+
Try this - regexp_extract (shown here in Scala; the expr variant carries over to PySpark almost unchanged):
df2.withColumn("Scripname",
    regexp_extract($"event_metadata", "^\\{\\s*Scripname\\s*:\\s*\"(\\w+)\"", 1))
  .show(false)

df2.withColumn("Scripname",
    expr("""regexp_extract(event_metadata, '^\\{\\s*Scripname\\s*:\\s*"(\\w+)"', 1)"""))
  .show(false)
/**
* +---------+-----------------------------------------------------+---------+
* |client_id|event_metadata |Scripname|
* +---------+-----------------------------------------------------+---------+
* |18890 |{Scripname:"DELL", Exchange: "NSE", Segment: "EQ" } |DELL |
* |10531 |{Scripname:"NAUKRI", Exchange: "NSE", Segment: "EQ" }|NAUKRI |
* +---------+-----------------------------------------------------+---------+
*/
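A rough PySpark equivalent of the same regexp_extract idea, using the df1 name from the question (a sketch; adjust the pattern if your strings differ):
from pyspark.sql import functions as F

df1.withColumn(
    "Scripname",
    F.regexp_extract("event_metadata", r'^\{\s*Scripname\s*:\s*"(\w+)"', 1)
).select("client_id", "Scripname").show(truncate=False)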
If the string is actually valid JSON (i.e. the keys are quoted), define the schema properly and parse it with from_json:
import pyspark.sql.functions as f
from pyspark.sql.types import StructType, StructField, StringType
schema = StructType([
    StructField('Scripname', StringType(), True),
    StructField('Exchange', StringType(), True),
    StructField('Segment', StringType(), True)
])
df.withColumn('from_json', f.from_json('event_metadata', schema)) \
    .show(10, False)
+---------+-----------------------------------------------------------+-----------------+
|client_id|event_metadata |from_json |
+---------+-----------------------------------------------------------+-----------------+
|18890 |{"Scripname": "DELL", "Exchange": "NSE", "Segment": "EQ"} |[DELL, NSE, EQ] |
|10531 |{"Scripname": "NAUKRI", "Exchange": "NSE", "Segment": "EQ"}|[NAUKRI, NSE, EQ]|
+---------+-----------------------------------------------------------+-----------------+
Now the from_json column is a struct type, and you can select its elements with col('from_json.Scripname').
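For example, to get just client_id and Scripname, continuing from the code above (a sketch):
df.withColumn('from_json', f.from_json('event_metadata', schema)) \
    .select('client_id', f.col('from_json.Scripname').alias('Scripname')) \
    .show(truncate=False)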

Manually create a pyspark dataframe

I am trying to manually create a pyspark dataframe given certain data:
row_in = [(1566429545575348), (40.353977), (-111.701859)]
rdd = sc.parallelize(row_in)
schema = StructType(
    [
        StructField("time_epocs", DecimalType(), True),
        StructField("lat", DecimalType(), True),
        StructField("long", DecimalType(), True),
    ]
)
df_in_test = spark.createDataFrame(rdd, schema)
This gives an error when I try to display the dataframe, so I am not sure how to do this.
However, the Spark documentation seems to be a bit convoluted to me, and I got similar errors when I tried to follow those instructions.
Does anyone know how to do this?
Simple dataframe creation:
df = spark.createDataFrame(
    [
        (1, "foo"),  # create your data here, be consistent in the types.
        (2, "bar"),
    ],
    ["id", "label"]  # add your column names here
)
df.printSchema()
root
|-- id: long (nullable = true)
|-- label: string (nullable = true)
df.show()
+---+-----+
| id|label|
+---+-----+
| 1| foo|
| 2| bar|
+---+-----+
According to the official docs:
when schema is a list of column names, the type of each column will be inferred from data. (example above ↑)
When schema is pyspark.sql.types.DataType or a datatype string, it must match the real data. (examples below ↓)
# Example with a datatype string
df = spark.createDataFrame(
    [
        (1, "foo"),  # Add your data here
        (2, "bar"),
    ],
    "id int, label string",  # add column names and types here
)
# Example with pyspark.sql.types
from pyspark.sql import types as T
df = spark.createDataFrame(
    [
        (1, "foo"),  # Add your data here
        (2, "bar"),
    ],
    T.StructType(  # Define the whole schema within a StructType
        [
            T.StructField("id", T.IntegerType(), True),
            T.StructField("label", T.StringType(), True),
        ]
    ),
)
df.printSchema()
root
|-- id: integer (nullable = true) # type is forced to Int
|-- label: string (nullable = true)
Additionally, you can create your dataframe from a Pandas dataframe; the schema will be inferred from the Pandas dataframe's types:
import pandas as pd
import numpy as np
pdf = pd.DataFrame(
    {
        "col1": [np.random.randint(10) for x in range(10)],
        "col2": [np.random.randint(100) for x in range(10)],
    }
)
df = spark.createDataFrame(pdf)
df.show()
+----+----+
|col1|col2|
+----+----+
| 6| 4|
| 1| 39|
| 7| 4|
| 7| 95|
| 6| 3|
| 7| 28|
| 2| 26|
| 0| 4|
| 4| 32|
+----+----+
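If you would rather control the types than have them inferred from the Pandas dtypes, createDataFrame also accepts a schema alongside the Pandas dataframe (a sketch using the pdf above):
# DDL-style schema string; a StructType works here as well
df = spark.createDataFrame(pdf, schema="col1 int, col2 int")
df.printSchema()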
To elaborate/build off of @Steven's answer:
from pyspark.sql.types import StructType, StructField, FloatType, StringType

field = [
    StructField("MULTIPLIER", FloatType(), True),
    StructField("DESCRIPTION", StringType(), True),
]
schema = StructType(field)
multiplier_df = sqlContext.createDataFrame(sc.emptyRDD(), schema)
Will create a blank dataframe.
We can now simply add a row to it:
l = [(2.3, "this is a sample description")]
rdd = sc.parallelize(l)
multiplier_df_temp = spark.createDataFrame(rdd, schema)
multiplier_df = multiplier_df.union(multiplier_df_temp)
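As a side note, the RDD step isn't strictly needed; a shorter sketch with the same schema would be:
multiplier_df = spark.createDataFrame(
    [(2.3, "this is a sample description")], schema
)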
This answer demonstrates how to create a PySpark DataFrame with createDataFrame, create_df and toDF.
df = spark.createDataFrame([("joe", 34), ("luisa", 22)], ["first_name", "age"])
df.show()
+----------+---+
|first_name|age|
+----------+---+
| joe| 34|
| luisa| 22|
+----------+---+
You can also pass createDataFrame an RDD and a schema to construct DataFrames with more precision:
from pyspark.sql import Row
from pyspark.sql.types import *
rdd = spark.sparkContext.parallelize([
    Row(name='Allie', age=2),
    Row(name='Sara', age=33),
    Row(name='Grace', age=31)])
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), False)])
df = spark.createDataFrame(rdd, schema)
df.show()
df.show()
+-----+---+
| name|age|
+-----+---+
|Allie| 2|
| Sara| 33|
|Grace| 31|
+-----+---+
create_df from my Quinn project allows for the best of both worlds - it's concise and fully descriptive:
from pyspark.sql.types import *
from quinn.extensions import *
df = spark.create_df(
    [("jose", "a"), ("li", "b"), ("sam", "c")],
    [("name", StringType(), True), ("blah", StringType(), True)]
)
df.show()
+----+----+
|name|blah|
+----+----+
|jose| a|
| li| b|
| sam| c|
+----+----+
toDF doesn't offer any advantages over the other approaches:
from pyspark.sql import Row
rdd = spark.sparkContext.parallelize([
    Row(name='Allie', age=2),
    Row(name='Sara', age=33),
    Row(name='Grace', age=31)])
df = rdd.toDF()
df.show()
+-----+---+
| name|age|
+-----+---+
|Allie| 2|
| Sara| 33|
|Grace| 31|
+-----+---+
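For completeness, toDF also accepts explicit column names when the RDD holds plain tuples instead of Rows, e.g. (a sketch):
df = spark.sparkContext.parallelize([("Allie", 2), ("Sara", 33), ("Grace", 31)]).toDF(["name", "age"])
df.show()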
With formatting
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [
        (1, "foo"),
        (2, "bar"),
    ],
    StructType(
        [
            StructField("id", IntegerType(), False),
            StructField("txt", StringType(), False),
        ]
    ),
)
print(df.dtypes)
df.show()
Extending @Steven's answer:
data = [(i, 'foo') for i in range(1000)] # random data
columns = ['id', 'txt'] # add your columns label here
df = spark.createDataFrame(data, columns)
Note: when schema is a list of column names, the type of each column will be inferred from the data.
If you want to define the schema explicitly, then do this:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
schema = StructType([StructField("id", IntegerType(), True), StructField("txt", StringType(), True)])
df1 = spark.createDataFrame(data, schema)
Outputs:
>>> df1
DataFrame[id: int, txt: string]
>>> df
DataFrame[id: bigint, txt: string]
For beginners, a full example importing data from a file:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    ShortType,
    StringType,
    StructType,
    StructField,
    TimestampType,
)
import os
here = os.path.abspath(os.path.dirname(__file__))
spark = SparkSession.builder.getOrCreate()
schema = StructType(
    [
        StructField("id", ShortType(), nullable=False),
        StructField("string", StringType(), nullable=False),
        StructField("datetime", TimestampType(), nullable=False),
    ]
)
# read file or construct rows manually
df = spark.read.csv(os.path.join(here, "data.csv"), schema=schema, header=True)
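And, per the "construct rows manually" comment, the same schema can be fed with in-memory rows instead of a file. A sketch with made-up values:
from datetime import datetime

# illustrative rows only; types must match the schema above
df = spark.createDataFrame(
    [
        (1, "foo", datetime(2021, 1, 1, 12, 0)),
        (2, "bar", datetime(2021, 1, 2, 12, 0)),
    ],
    schema=schema,
)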

Join/unfolded mapType column in spark back with the original dataframe

I have a dataframe in (py)Spark where one of the columns has the type 'map'. I want to flatten or split that column into multiple columns, which should be added to the original dataframe. I'm able to unfold the column with flatMap, however I lose the key I need to join the new dataframe (from the unfolded column) back to the original dataframe.
My schema is like this:
root
|-- key: string (nullable = true)
|-- metric: map (nullable = false)
| |-- key: string
| |-- value: float (valueContainsNull = true)
As you can see, the column 'metric' is a map-field. This is the column that I want to flatten. Before flattening it looks like:
+----+---------------------------------------------------+
|key |metric |
+----+---------------------------------------------------+
|123k|Map(metric1 -> 1.3, metric2 -> 6.3, metric3 -> 7.6)|
|d23d|Map(metric1 -> 1.5, metric2 -> 2.0, metric3 -> 2.2)|
|as3d|Map(metric1 -> 2.2, metric2 -> 4.3, metric3 -> 9.0)|
+----+---------------------------------------------------+
To convert that field to columns I do
df2.select('metric').rdd.flatMap(lambda x: x).toDF().show()
which gives
+------------------+-----------------+-----------------+
| metric1| metric2| metric3|
+------------------+-----------------+-----------------+
|1.2999999523162842|6.300000190734863|7.599999904632568|
| 1.5| 2.0|2.200000047683716|
| 2.200000047683716|4.300000190734863| 9.0|
+------------------+-----------------+-----------------+
However, I don't see the key, therefore I don't know how to add this data to the original dataframe.
What I want is:
+----+-------+-------+-------+
| key|metric1|metric2|metric3|
+----+-------+-------+-------+
|123k| 1.3| 6.3| 7.6|
|d23d| 1.5| 2.0| 2.2|
|as3d| 2.2| 4.3| 9.0|
+----+-------+-------+-------+
My question thus is: how can I get df2 back to df (given that I originally don't know df and only have df2)?
To make df2:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

rdd = sc.parallelize([('123k', 1.3, 6.3, 7.6),
                      ('d23d', 1.5, 2.0, 2.2),
                      ('as3d', 2.2, 4.3, 9.0)
                      ])
schema = StructType([StructField('key', StringType(), True),
                     StructField('metric1', FloatType(), True),
                     StructField('metric2', FloatType(), True),
                     StructField('metric3', FloatType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
from pyspark.sql.functions import lit, col, create_map
from itertools import chain
metric = create_map(list(chain(*(
    (lit(name), col(name)) for name in df.columns if "metric" in name
)))).alias("metric")
df2 = df.select("key", metric)
from pyspark.sql.functions import explode
# fetch column names of the original dataframe from keys of MapType 'metric' column
col_names = df2.select(explode("metric")).select("key").distinct().sort("key").rdd.flatMap(lambda x: x).collect()
exprs = [col("key")] + [col("metric").getItem(k).alias(k) for k in col_names]
df2_to_original_df = df2.select(*exprs)
df2_to_original_df.show()
Output is:
+----+-------+-------+-------+
| key|metric1|metric2|metric3|
+----+-------+-------+-------+
|123k| 1.3| 6.3| 7.6|
|d23d| 1.5| 2.0| 2.2|
|as3d| 2.2| 4.3| 9.0|
+----+-------+-------+-------+
I can select a certain key from a maptype column by doing:
df.select('maptypecolumn.key')
In my example I did it as follows:
columns = df2.select('metric').rdd.flatMap(lambda x: x).toDF().columns
for i in columns:
    df2 = df2.withColumn(i, lit(df2.metric[i]))
You can access key and value for example like this:
from pyspark.sql.functions import explode
df.select(explode("custom_dimensions")).select("key")

Scala DataFrame: Explode an array

I am using the Spark libraries in Scala. I have created a DataFrame using:
val searchArr = Array(
  StructField("log", IntegerType, true),
  StructField("user", StructType(Array(
    StructField("date", StringType, true),
    StructField("ua", StringType, true),
    StructField("ui", LongType, true))), true),
  StructField("what", StructType(Array(
    StructField("q1", ArrayType(IntegerType, true), true),
    StructField("q2", ArrayType(IntegerType, true), true),
    StructField("sid", StringType, true),
    StructField("url", StringType, true))), true),
  StructField("where", StructType(Array(
    StructField("o1", IntegerType, true),
    StructField("o2", IntegerType, true))), true)
)
val searchSt = new StructType(searchArr)
val searchData = sqlContext.jsonFile(searchPath, searchSt)
I now want to explode the field what.q1, which should contain an array of integers, but the documentation is limited:
http://spark.apache.org/docs/1.4.0/api/java/org/apache/spark/sql/DataFrame.html#explode(java.lang.String,%20java.lang.String,%20scala.Function1,%20scala.reflect.api.TypeTags.TypeTag)
So far I have tried a few things without much luck:
val searchSplit = searchData.explode("q1", "rb")(q1 => q1.getList[Int](0).toArray())
Any ideas/examples of how to use explode on an array?
Did you try a UDF on field "what"? Something like this could be useful:
val explode = udf {
  (aStr: GenericRowWithSchema) =>
    aStr match {
      case null => ""
      case _    => aStr.getList(0).get(0).toString()
    }
}
val newDF = df.withColumn("newColumn", explode(col("what")))
where:
getList(0) returns the "q1" field
get(0) returns the first element of "q1"
I'm not sure but you could try to use getAs[T](fieldName: String) instead of getList(index: Int).
I'm not used to Scala, but in Python/PySpark the array-type column nested within a struct-type field can be exploded as follows. If it works for you, you can then convert it to the corresponding Scala representation.
from pyspark.sql.functions import col, explode
from pyspark.sql.types import ArrayType, IntegerType, LongType, StringType, StructField, StructType
schema = StructType([
    StructField("log", IntegerType()),
    StructField("user", StructType([
        StructField("date", StringType()),
        StructField("ua", StringType()),
        StructField("ui", LongType())])),
    StructField("what", StructType([
        StructField("q1", ArrayType(IntegerType())),
        StructField("q2", ArrayType(IntegerType())),
        StructField("sid", StringType()),
        StructField("url", StringType())])),
    StructField("where", StructType([
        StructField("o1", IntegerType()),
        StructField("o2", IntegerType())]))
])
data = [((1), ("2022-01-01","ua",1), ([1,2,3],[6],"sid","url"), (7,8))]
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)
Output:
+---+-------------------+--------------------------+------+
|log|user |what |where |
+---+-------------------+--------------------------+------+
|1 |{2022-01-01, ua, 1}|{[1, 2, 3], [6], sid, url}|{7, 8}|
+---+-------------------+--------------------------+------+
With what.q1 exploded:
df.withColumn("what.q1_exploded", explode(col("what.q1"))).show(truncate=False)
Output:
+---+-------------------+--------------------------+------+----------------+
|log|user |what |where |what.q1_exploded|
+---+-------------------+--------------------------+------+----------------+
|1 |{2022-01-01, ua, 1}|{[1, 2, 3], [6], sid, url}|{7, 8}|1 |
|1 |{2022-01-01, ua, 1}|{[1, 2, 3], [6], sid, url}|{7, 8}|2 |
|1 |{2022-01-01, ua, 1}|{[1, 2, 3], [6], sid, url}|{7, 8}|3 |
+---+-------------------+--------------------------+------+----------------+