Convert SQL Case Statement into Spark - pyspark

How do I convert this SQL case statement into Spark SQL?
replace_old_engagements_sql = """ UPDATE """ + my_table_name + """
SET Engagement = CASE Engagement
WHEN '800000026680' THEN '800000032764'
WHEN '807000000041' THEN '808000000000'
WHEN '870000012569' THEN '807000000412'
WHEN '807000000279' THEN '808000000223'
WHEN '807000000282' THEN '808000000223'
WHEN '870000000403' THEN '808000000223'
END
WHERE LinkedAccountId in ('123456789101','109876543212') AND Engagement IN ('800000026680', '807000000041', '870000012569', '807000000279', '807000000282', '870000000403'); """

I guess your spark sql would be something close to this.
spark.sql("""
INSERT OVERWRITE TABLE db.my_table_name
SELECT
CASE
WHEN LinkedAccountId in ('123456789101','109876543212') THEN
CASE
WHEN Engagement = '800000026680' THEN '800000032764'
WHEN Engagement = '807000000041' THEN '808000000000'
WHEN Engagement = '870000012569' THEN '807000000412'
WHEN Engagement = '807000000279' THEN '808000000223'
WHEN Engagement = '807000000282' THEN '808000000223'
WHEN Engagement = '870000000403' THEN '808000000223'
ELSE Engagement
END
ELSE Engagement
END as Engagement
from db.my_table_name
""")

# I hope this answers your query in DataFrame style: it creates a new DataFrame with the values modified according to the specified conditions.
// Builds a new DataFrame containing only the rows targeted by the UPDATE's WHERE clause,
// with Engagement remapped to its replacement code. Rows that do not pass the filter are
// dropped from out_df rather than carried over unchanged.
// NOTE: Scala string literals need double quotes; single quotes denote Char literals, so the
// multi-character values would not compile.
val out_df = input_df
  .filter(
    $"LinkedAccountId".isin("123456789101", "109876543212") and
      $"Engagement".isin(
        "800000026680", "807000000041", "870000012569",
        "807000000279", "807000000282", "870000000403"
      )
  )
  .withColumn(
    "Engagement",
    when($"Engagement" === "800000026680", "800000032764")
      .when($"Engagement" === "807000000041", "808000000000")
      .when($"Engagement" === "870000012569", "807000000412")
      .when($"Engagement" === "807000000279", "808000000223")
      .when($"Engagement" === "807000000282", "808000000223")
      .when($"Engagement" === "870000000403", "808000000223")
      // Fallback value; the filter above already restricts Engagement to the six mapped codes.
      .otherwise("0")
  )

Related

Exporting PySpark Dataframe to Azure Data Lake Takes Forever

The code below ran perfectly well on the standalone version of PySpark 2.4 on Mac OS (Python 3.7) when the size of the input data (around 6 GB) was small. However, when I ran the code on HDInsight cluster (HDI 4.0, i.e. Python 3.5, PySpark 2.4, 4 worker nodes and each has 64 cores and 432 GB of RAM, 2 header nodes and each has 4 cores and 28 GB of RAM, 2nd generation of data lake) with larger input data (169 GB), the last step, which is, writing data to the data lake, took forever (I killed it after 24 hours of execution) to complete. Given the fact that HDInsight is not popular in the cloud computing community, I could only reference posts that complained about the low speed when writing dataframe to S3. Some suggested to repartition the dataset, which I did, but it did not help.
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType, BooleanType
from pyspark.sql.functions import udf, regexp_extract, collect_set, array_remove, col, size, asc, desc
from pyspark.ml.fpm import FPGrowth
import os
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.5"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3.5"
def work(order_path, beer_path, corpus_path, output_path, FREQ_THRESHOLD=1000, LIFT_THRESHOLD=1):
print("Creating Spark Environment...")
spark = SparkSession.builder.appName("Menu").getOrCreate()
print("Spark Environment Created!")
print("Working on Checkpoint1...")
orders = spark.read.csv(order_path)
orders.createOrReplaceTempView("orders")
orders = spark.sql(
"SELECT _c14 AS order_id, _c31 AS in_menu_id, _c32 AS in_menu_name FROM orders"
)
orders.createOrReplaceTempView("orders")
beer = spark.read.csv(
beer_path,
header=True
)
beer.createOrReplaceTempView("beer")
beer = spark.sql(
"""
SELECT
order_id AS beer_order_id,
in_menu_id AS beer_in_menu_id,
'-999' AS beer_in_menu_name
FROM beer
"""
)
beer.createOrReplaceTempView("beer")
orders = spark.sql(
"""
WITH orders_beer AS (
SELECT *
FROM orders
LEFT JOIN beer
ON orders.in_menu_id = beer.beer_in_menu_id
)
SELECT
order_id,
in_menu_id,
CASE
WHEN beer_in_menu_name IS NOT NULL THEN beer_in_menu_name
WHEN beer_in_menu_name IS NULL THEN in_menu_name
END AS menu_name
FROM orders_beer
"""
)
print("Checkpoint1 Completed!")
print("Working on Checkpoint2...")
corpus = spark.read.csv(
corpus_path,
header=True
)
keywords = corpus.select("Food_Name").rdd.flatMap(lambda x: x).collect()
# Tag each row with the corpus keyword found in menu_name as a whitespace-delimited token.
# Rows without a match get an empty string and are filtered out by the query that follows.
orders = orders.withColumn(
    "keyword",
    regexp_extract(
        "menu_name",
        # Raw strings pass \s to the regex engine untouched (it is an invalid escape in a
        # normal Python string). The leading assertion must be a lookbehind: the previous
        # lookahead `(?=^|\s)` demanded that the keyword itself begin with whitespace
        # everywhere except position 0, so keywords in the middle of a name never matched.
        r"(?<=^|\s)(" + "|".join(keywords) + r")(?=\s|$)",
        0
    )
)
orders.createOrReplaceTempView("orders")
orders = spark.sql("""
SELECT order_id, in_menu_id, keyword
FROM orders
WHERE keyword != ''
""")
orders.createOrReplaceTempView("orders")
orders = orders.groupBy("order_id").agg(
collect_set("keyword").alias("items")
)
print("Checkpoint2 Completed!")
print("Working on Checkpoint3...")
fpGrowth = FPGrowth(
itemsCol="items",
minSupport=0,
minConfidence=0
)
model = fpGrowth.fit(orders)
print("Checkpoint3 Completed!")
print("Working on Checkpoint4...")
frequency = model.freqItemsets
frequency = frequency.filter(col("freq") > FREQ_THRESHOLD)
frequency = frequency.withColumn(
"items",
array_remove("items", "-999")
)
frequency = frequency.filter(size(col("items")) > 0)
frequency = frequency.orderBy(asc("items"), desc("freq"))
frequency = frequency.dropDuplicates(["items"])
frequency = frequency.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(frequency.items)
)
frequency.createOrReplaceTempView("frequency")
lift = model.associationRules
lift = lift.drop("confidence")
lift = lift.filter(col("lift") > LIFT_THRESHOLD)
lift = lift.filter(
udf(
lambda x: x == ["-999"], BooleanType()
)(lift.consequent)
)
lift = lift.drop("consequent")
lift = lift.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(lift.antecedent)
)
lift.createOrReplaceTempView("lift")
result = spark.sql(
"""
SELECT lift.antecedent, freq AS frequency, lift
FROM lift
INNER JOIN frequency
ON lift.antecedent = frequency.antecedent
"""
)
print("Checkpoint4 Completed!")
print("Writing Result to Data Lake...")
result.repartition(1024).write.mode("overwrite").parquet(output_path)
print("All Done!")
def main():
work(
order_path=169.1 GB of txt,
beer_path=4.9 GB of csv,
corpus_path=210 KB of csv,
output_path="final_result.parquet"
)
if __name__ == "__main__":
main()
I first thought this was caused by the file format parquet. However, when I tried csv, I met with the same problem. I tried result.count() to see how many rows the table result has. It took forever to get the row number, just like writing the data to the data lake.
There was a suggestion to use broadcast hash join instead of the default sort-merge join if a large dataset is joined with a small dataset. I thought it was worth trying because the smaller samples in the pilot study told me the row number of frequency is roughly 0.09% of that of lift (See the query below if you have difficulty tracking frequency and lift).
SELECT lift.antecedent, freq AS frequency, lift
FROM lift
INNER JOIN frequency
ON lift.antecedent = frequency.antecedent
With that in mind, I revised my code:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType, BooleanType
from pyspark.sql.functions import udf, regexp_extract, collect_set, array_remove, col, size, asc, desc
from pyspark.ml.fpm import FPGrowth
import os
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.5"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3.5"
def work(order_path, beer_path, corpus_path, output_path, FREQ_THRESHOLD=1000, LIFT_THRESHOLD=1):
print("Creating Spark Environment...")
spark = SparkSession.builder.appName("Menu").getOrCreate()
print("Spark Environment Created!")
print("Working on Checkpoint1...")
orders = spark.read.csv(order_path)
orders.createOrReplaceTempView("orders")
orders = spark.sql(
"SELECT _c14 AS order_id, _c31 AS in_menu_id, _c32 AS in_menu_name FROM orders"
)
orders.createOrReplaceTempView("orders")
beer = spark.read.csv(
beer_path,
header=True
)
beer.createOrReplaceTempView("beer")
beer = spark.sql(
"""
SELECT
order_id AS beer_order_id,
in_menu_id AS beer_in_menu_id,
'-999' AS beer_in_menu_name
FROM beer
"""
)
beer.createOrReplaceTempView("beer")
orders = spark.sql(
"""
WITH orders_beer AS (
SELECT *
FROM orders
LEFT JOIN beer
ON orders.in_menu_id = beer.beer_in_menu_id
)
SELECT
order_id,
in_menu_id,
CASE
WHEN beer_in_menu_name IS NOT NULL THEN beer_in_menu_name
WHEN beer_in_menu_name IS NULL THEN in_menu_name
END AS menu_name
FROM orders_beer
"""
)
print("Checkpoint1 Completed!")
print("Working on Checkpoint2...")
corpus = spark.read.csv(
corpus_path,
header=True
)
keywords = corpus.select("Food_Name").rdd.flatMap(lambda x: x).collect()
# Tag each row with the corpus keyword found in menu_name as a whitespace-delimited token.
# Rows without a match get an empty string and are filtered out by the query that follows.
orders = orders.withColumn(
    "keyword",
    regexp_extract(
        "menu_name",
        # Raw strings pass \s to the regex engine untouched (it is an invalid escape in a
        # normal Python string). The leading assertion must be a lookbehind: the previous
        # lookahead `(?=^|\s)` demanded that the keyword itself begin with whitespace
        # everywhere except position 0, so keywords in the middle of a name never matched.
        r"(?<=^|\s)(" + "|".join(keywords) + r")(?=\s|$)",
        0
    )
)
orders.createOrReplaceTempView("orders")
orders = spark.sql("""
SELECT order_id, in_menu_id, keyword
FROM orders
WHERE keyword != ''
""")
orders.createOrReplaceTempView("orders")
orders = orders.groupBy("order_id").agg(
collect_set("keyword").alias("items")
)
print("Checkpoint2 Completed!")
print("Working on Checkpoint3...")
fpGrowth = FPGrowth(
itemsCol="items",
minSupport=0,
minConfidence=0
)
model = fpGrowth.fit(orders)
print("Checkpoint3 Completed!")
print("Working on Checkpoint4...")
frequency = model.freqItemsets
frequency = frequency.filter(col("freq") > FREQ_THRESHOLD)
frequency = frequency.withColumn(
"antecedent",
array_remove("items", "-999")
)
frequency = frequency.drop("items")
frequency = frequency.filter(size(col("antecedent")) > 0)
frequency = frequency.orderBy(asc("antecedent"), desc("freq"))
frequency = frequency.dropDuplicates(["antecedent"])
frequency = frequency.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(frequency.antecedent)
)
lift = model.associationRules
lift = lift.drop("confidence")
lift = lift.filter(col("lift") > LIFT_THRESHOLD)
lift = lift.filter(
udf(
lambda x: x == ["-999"], BooleanType()
)(lift.consequent)
)
lift = lift.drop("consequent")
lift = lift.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(lift.antecedent)
)
result = lift.join(
frequency.hint("broadcast"),
["antecedent"],
"inner"
)
print("Checkpoint4 Completed!")
print("Writing Result to Data Lake...")
result.repartition(1024).write.mode("overwrite").parquet(output_path)
print("All Done!")
def main():
work(
order_path=169.1 GB of txt,
beer_path=4.9 GB of csv,
corpus_path=210 KB of csv,
output_path="final_result.parquet"
)
if __name__ == "__main__":
main()
The code ran perfectly well with the same sample data on my Mac and, as expected, took less time (34 seconds vs. 26 seconds). Then I decided to run the code on HDInsight with the full datasets. In the last step, which is writing data to the data lake, the task failed and I was told "Job cancelled because SparkContext was shut down". I am rather new to big data and have no idea what this means. Posts on the internet said there could be many reasons behind it. Whatever method I should use, how can I optimize my code so that I get the desired output in the data lake within a bearable amount of time?
I would try several things, ordered by the amount of energy they require:
Check if the ADL storage is in the same region as your HDInsight cluster.
Add calls for df = df.cache() after heavy calculations, or even write and then read the dataframes into and from a cache storage in between these calculations.
Replace your UDFs with "native" Spark code, since UDFs are one of the performance bad practices of Spark.
I have figured it out after five days' struggle. Here are the approaches that I took to optimize the code. The time of code execution dropped from more than 24 hours to around 10 minutes. Code optimization is really really important.
As David Taub below pointed out, use df.cache() after heavy computation or before feeding the data to the model. I used df.cache().count() since calling .cache() on its own is lazily evaluated but the following .count() forces an evaluation of the entire dataset.
Use flashtext instead of regular expression to extract keywords. This greatly improves code performance.
Be careful with joins / merge. It might get extremely slow due to data skewness. Always think about ways to avoid unnecessary joins.
Set minSupport for FPGrowth. This significantly reduces the time when calling model.freqItemsets.

Fetching data from a List

I am new to PySpark and have purchased a book to enhance my PySpark skills. I am stuck while using a function.
Function
def filterDuplicates ( ( userID, ratings ) ):
(movie1, rating1) = ratings[0]
(movie2, rating2) = ratings[1]
return movie1 < movie2
I am getting an error because of the two consecutive parentheses. The step basically receives an RDD, which is essentially a list of tuples, as shown below:
[(196, ((242, 3.0), (242, 3.0))), (196, ((242, 3.0), (393, 4.0)))]
The final result should be only distinct movie ID, rating BY each viewer.
So in the above-given example, 196 is viewer ID, 242 is movie ID and 3.0 is rating given by viewer.
Kindly advise if I need to download a different version of python to use double parenthesis. Presently I have Python 3.7 installed on my machine.
Thanks,
AJ
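The variable names inside a tuple are of no use: Python 3 removed tuple parameter unpacking (PEP 3113), so `def f((a, b)):` is a syntax error in every Python 3 version. If you really want the tuple to be a parameter of the function, name the whole tuple and pick it apart inside the body, like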
def filterDuplicates(userData):
    """Return True when the first movie ID of a rating pair is smaller than the second.

    userData is a (userID, ratings) tuple, where ratings holds two (movie, rating)
    pairs, e.g. (196, ((242, 3.0), (393, 4.0))). Only the movie IDs are compared;
    the user ID and the rating values are not used.
    """
    ratings = userData[1]
    movie1 = ratings[0][0]
    movie2 = ratings[1][0]
    # Strict '<' rejects a movie paired with itself as well as the mirrored (b, a) ordering.
    return movie1 < movie2

SPARK SQL: Implement AND condition inside a CASE statement

I am aware of how to implement a simple CASE-WHEN-THEN clause in SPARK SQL using Scala. I am using Version 1.6.2. But, I need to specify AND condition on multiple columns inside the CASE-WHEN clause. How to achieve this in SPARK using Scala ?
Thanks in advance for your time and help!
Here's the SQL query that I have:
select sd.standardizationId,
case when sd.numberOfShares = 0 and
isnull(sd.derivatives,0) = 0 and
sd.holdingTypeId not in (3,10)
then
8
else
holdingTypeId
end
as holdingTypeId
from sd;
First read table as dataframe
val table = sqlContext.table("sd")
Then select with an expression. Adjust the syntax there according to your database.
// Projects standardizationId plus a recomputed holdingTypeId: 8 when all three conditions
// hold, otherwise the row's existing holdingTypeId.
// coalesce(derivatives, 0) stands in for T-SQL's two-argument ISNULL(derivatives, 0);
// Spark SQL's isnull takes a single argument and returns a boolean.
val result = table.selectExpr(
  "standardizationId",
  "case when numberOfShares = 0 and coalesce(derivatives, 0) = 0 and holdingTypeId not in (3,10) then 8 else holdingTypeId end as holdingTypeId"
)
And show result
result.show
An alternative option, if it's wanted to avoid using the full string expression, is the following:
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions._
val sd = sqlContext.table("sd")
val conditionedColumn: Column = when(
(sd("numberOfShares") === 0) and
(coalesce(sd("derivatives"), lit(0)) === 0) and
(!sd("holdingTypeId").isin(Seq(3,10): _*)), 8
).otherwise(sd("holdingTypeId")).as("holdingTypeId")
val result = sd.select(sd("standardizationId"), conditionedColumn)

combining slick queries into single query

Using Slick 3.1, how do I combine multiple Queries into a single query for the same type? This is not a join or a union, but combining query "segments" to create a single query request. These "segments" can be any individually valid query.
val query = TableQuery[SomeThingValid]
// build up pieces of the query in various parts of the application logic
val q1 = query.filter(_.value > 10)
val q2 = query.filter(_.value < 40)
val q3 = query.sortBy(_.date.desc)
val q4 = query.take(5)
// how to combine these into a single query ?
val finalQ = ??? q1 q2 q3 q4 ???
// in order to run in a single request
val result = DB.connection.run(finalQ.result)
EDIT:
the expected sql should be something like:
SELECT * FROM "SomeThingValid" WHERE "SomeThingValid"."value" > 10 AND "SomeThingValid"."value" < 40 ORDER BY "SomeThingValid"."date" DESC LIMIT 5
val q1 = query.filter(_.value > 10)
val q2 = q1.filter(_.value < 40)
val q3 = q2.sortBy(_.date.desc)
val q4 = q3.take(5)
I think you should do something like the above (and pass around Querys) but if you insist on passing around query "segments", something like this could work:
type QuerySegment = Query[SomeThingValid, SomeThingValid, Seq] => Query[SomeThingValid, SomeThingValid, Seq]
val q1: QuerySegment = _.filter(_.value > 10)
val q2: QuerySegment = _.filter(_.value < 40)
val q3: QuerySegment = _.sortBy(_.date.desc)
val q4: QuerySegment = _.take(5)
val finalQ = Function.chain(Seq(q1, q2, q3, q4))(query)
I've used this "pattern" with slick2.0
val query = TableQuery[SomeThingValid]
// Flags deciding which optional segments are applied to the base query.
val flag1, flag3 = false
val flag2, flag4 = true
// Each step either extends the previous query or passes it through unchanged,
// so finalQ is still a single query regardless of which flags are set.
val bottomFilteredQuery = if (flag1) query.filter(_.value > 10) else query
val topFilteredQuery = if (flag2) bottomFilteredQuery.filter(_.value < 40) else bottomFilteredQuery
val sortedQuery = if (flag3) topFilteredQuery.sortBy(_.date.desc) else topFilteredQuery
val finalQ = if (flag4) sortedQuery.take(5) else sortedQuery
One remark from the Essential Slick book is worth mentioning here: where possible, prefer a single database query over several queries sequenced as separate actions.
Combining actions to sequence queries is a powerful feature of Slick.
However, you may be able to reduce multiple queries into a single
database query. If you can do that, you’re probably better off doing
it.
I think, it should work. But I didn't test it yet.
val users = TableQuery[Users]
val filter1: Query[Users, User, Seq] = users.filter(condition1)
val filter2: Query[Users, User, Seq] = users.filter(condition2)
(filter1 ++ filter2).result.headOption

Is it possible to pass flags in to regex_matches with PostgreSQL?

I'm trying to select multiple matches from a field, I have the query working as SQL but am at a loss when it comes to the jOOQ version. Here's the SQL:
SELECT
array_to_string(array(select array_to_string(regexp_matches(m.content, '#([a-zA-Z0-9]+)', 'g'), ' ')), ' ') hashtags
FROM tweets
But I can't work out how to pass in the 'g' flag to regexp_matches, and Postgres doesn't support g: style embedded flags. At the moment I'm using (in Scala):
val hashtags = DSL.field(
"array_to_string(array(select array_to_string(regexp_matches(tweets.content, '#([a-zA-Z0-9]+)', 'g'), ' ')), ' ')",
classOf[String])
but that seems kind of gross (but it works, so there's that! 😀).
Current Approach
I have an enum with fields associated with it, like this:
MentionUri(MENTIONS.URL),
Content(MENTIONS.CONTENT),
Hashtags(DSL.field(
"array(select array_to_string(regexp_matches(mentions.content, '#([a-zA-Z0-9]+)', 'g'), ' '))").as("hashtags")),
MediaLinks(DSL.field("json_agg(twitter_media)").as("media_links")),
Location(MENTIONS.LOCATION),
the 'g' flag is needed because I want to get all of the matching fragments from the target text (all of the hashtags from the mention content in this case, where mentions are Tweets, Facebook posts, &c.)
and a class with a bunch of optional search parameters, like this:
minFollowing: Option[Int] = None,
maxFollowing: Option[Int] = None,
workflowStates: Set[WorkflowState] = WorkflowState.values.toSet,
onlyVerifiedUsers: Boolean = false,
and then I build up the query based on a list of these fields
private val fields = v.fields.toSet
override def query(sql: DSLContext): Query = {
val fields = v.fields.map(_.field) ++ Seq(M.PROJECT_ID)
var query = (sql
select fields.toSet.asJava
from M
leftJoin MTM on (M.PROJECT_ID === MTM.PROJECT_ID) and (M.ID === MTM.MENTION_ID)
leftJoin TM on (MTM.URL === TM.URL)
where (M.PROJECT_ID in v.criteria.projectIds.asJava))
if (v.criteria.channels.size < numChannels)
query = query and (M.CHANNEL in v.criteria.channels.asJava)
if (v.criteria.mentionTypes.size < numMentionTypes)
query = query and (M.MENTION_TYPE in v.criteria.mentionTypes.asJava)
// ... more critera get added here, finally ...
if (v.max.isDefined)
query groupBy (M.PROJECT_ID, M.ID, M.CHANNEL, M.USERNAME) limit v.max.get
else
query groupBy (M.PROJECT_ID, M.ID, M.CHANNEL, M.USERNAME)