I have a small PySpark script that looks for a value in a column called resource_tags_user_engagement.
If the value is blank, null, or contains a word, it should be replaced by a default. But instead of replacing only the blank, null, or word values, it's replacing ALL values:
import sys
import pyspark.sql.functions as f
from pyspark.context import SparkContext
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
# Set Glue Context
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session
spark.sql("set spark.sql.parquet.enableVectorizedReader=false")
# Create Dynamic Data Frame from table in the glue database
cost_allocation = glueContext.create_dynamic_frame.from_catalog(database="company_cost_allocation", table_name="company_cost_allocation")
# Convert dynamic frame to data frame
cost_allocation_df = cost_allocation.toDF()
# Set default engagements
cost_allocation_df = cost_allocation_df.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789101', '123456789102', '123456789103', '123456789104', '123456789105', '123456789106', '123456789107', '123456789108', '123456789109' )) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '400000008378'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
((f.col('line_item_usage_account_id') == f.lit('123456789110')) |
(f.col('line_item_usage_account_id') == f.lit('123456789111'))) &
(f.col('resource_tags_user_engagement') == f.lit('') ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '807000000401'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789112', '123456789113', '123456789114')) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '807000000412'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789115', '123456789116', '123456789117', '123456789118', '123456789119', '123456789120', '123456789121', '123456789122', '123456789123')) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '400000008692'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789124', '123456789125', '123456789126')) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '807000000412'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789127', '123456789128', '123456789129', '123456789130', '123456789131')) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '808000000298'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id') == '123456789132') &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '803000006453'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
((f.col('line_item_usage_account_id') == f.lit('123456789133')) |
(f.col('line_item_usage_account_id') == f.lit('123456789134'))) &
(f.col('resource_tags_user_engagement') == f.lit('') ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '400000008426'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
((f.col('line_item_usage_account_id') == f.lit('123456789135')) |
(f.col('line_item_usage_account_id') == f.lit('123456789136'))) &
(f.col('resource_tags_user_engagement') == f.lit('') ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '800000047650'
).otherwise(f.col('resource_tags_user_engagement')))
# Convert back to a DynamicFrame for further processing.
partitioned_dynamicframe = DynamicFrame.fromDF(cost_allocation_df, glueContext, "partitioned_df")
# Repartition the dynamic frame before writing to S3
cost_allocation_df = cost_allocation_df.repartition(5)
# Write to S3
output_dir = "s3://company-cur-reports/company-costs-transformed-legacy-billing"
datasink = glueContext.write_dynamic_frame.from_options(frame = partitioned_dynamicframe, connection_type = "s3", connection_options = {"path": output_dir}, format = "parquet", transformation_ctx = "datasink")
Why is it doing that? How can I get the script to replace only values that are null, blank or have a word in them?
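Two things in the posted code would produce exactly this behaviour. First, in Python/PySpark & binds more tightly than |, so a condition written as (account_filter) & (tag == '') | (tag.isNull()) | (tag.rlike('^[a-zA-Z]')) is evaluated as ((account_filter & blank) | null | word): the null and word tests are applied to every account. Second, a when() without an .otherwise() returns null for rows that don't match, so each intermediate withColumn nulls out the values it didn't touch and the next isNull() test then replaces them. Below is a minimal sketch (not the full fix) of one grouped condition with an explicit fallback, reusing the question's column names; the account IDs and default are just the first group from above:
import pyspark.sql.functions as f

tag = f.col('resource_tags_user_engagement')
# True only when the tag actually needs a default
needs_default = tag.isNull() | (tag == '') | tag.rlike('^[a-zA-Z]')

cost_allocation_df = cost_allocation_df.withColumn(
    'resource_tags_user_engagement',
    f.when(
        # parentheses keep the account filter and the whole needs_default group together
        f.col('line_item_usage_account_id').isin('123456789101', '123456789102') & needs_default,
        '400000008378'
    ).otherwise(tag)  # leave every other row untouched
)
The remaining account groups can be chained as further .when(...) clauses on the same expression, with a single .otherwise(tag) at the end, instead of separate withColumn calls.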
I have been trying to generate the range of months between two given dates, but it's not working as expected.
e.g.
start_date (dd-mm-yyyy) = 12-01-2022
end_date (dd-mm-yyyy) = 03-06-2022
Expected output:
Valid_From | Valid_To
2022-01-12 | 2022-01-31
2022-02-01 | 2022-02-28
2022-03-01 | 2022-03-31
2022-04-01 | 2022-04-30
2022-05-01 | 2022-05-31
2022-06-01 | 2022-06-03
My code:
var_forecast_start_date = datetime.datetime(2022, 1, 12)
var_forecast_end_date = datetime.datetime(2022, 6, 2)
df_datetime = pandas_to_spark(
df_datetime(start=var_forecast_start_date, end=var_forecast_end_date)
)
df_datetime = df_datetime.withColumn(
"DateID", date_format(df_datetime.Date, "yyyyMMdd").cast(IntegerType())
).withColumn("FiscalDate", date_format(df_datetime.Date, "yyyy-MM-dd"))
df_datetime = df_datetime.selectExpr(
"add_months(date_add(last_day(Date),1),-1) AS Valid_From",
"last_day(Date) AS Valid_To",
).distinct()
You could try the following:
import findspark
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
findspark.init()
spark = SparkSession.builder.appName("local").getOrCreate()
columns = ["start_date", "end_date"]
data = [("12-01-2022", "03-06-2022")]
df = spark.createDataFrame(data).toDF(*columns)
df = (
df.withColumn(
"start_date", F.to_date(F.col("start_date"), "dd-MM-yyyy").cast("DATE")
)
.withColumn(
"end_date", F.to_date(F.col("end_date"), "dd-MM-yyyy").cast("DATE")
)
.withColumn(
"months_between",
F.round(
F.months_between(F.col("end_date"), F.col("start_date"), True)
).cast("Integer"),
)
.withColumn(
"months_between_seq", F.sequence(F.lit(1), F.col("months_between"))
)
.withColumn("months_between_seq", F.explode(F.col("months_between_seq")))
.withColumn(
"end_of_month",
F.expr(
"""
LAST_DAY(ADD_MONTHS(start_date, months_between_seq - 1))
"""
),
)
.withColumn(
"begin_of_month",
F.expr(
"""
LAST_DAY(ADD_MONTHS(start_date, months_between_seq - 1)) + 1
"""
),
)
)
start_window_agg = Window.partitionBy().orderBy("Valid_From")
start_union_sdf = (
df.select(
F.col("start_date").alias("Valid_From")
)
.unionByName(
df.select(
F.col("begin_of_month").alias("Valid_From")
)
)
.drop_duplicates()
.withColumn(
"row_number",
F.row_number().over(start_window_agg)
)
)
end_window_agg = Window.partitionBy().orderBy("Valid_To")
end_union_sdf = (
df.select(
F.col("end_date").alias("Valid_To")
)
.unionByName(
df.select(
F.col("end_of_month").alias("Valid_To")
)
)
.drop_duplicates()
.withColumn(
"row_number",
F.row_number().over(end_window_agg)
)
)
join_sdf = (
end_union_sdf
.join(
start_union_sdf,
how="inner",
on=["row_number"]
)
.drop("row_number")
.withColumn("Valid_To", F.col("Valid_To").cast("DATE"))
.withColumn("Valid_From", F.col("Valid_From").cast("DATE"))
.select("Valid_From", "Valid_To")
.orderBy("Valid_From")
)
join_sdf.show()
It returns:
+----------+----------+
|Valid_From| Valid_To|
+----------+----------+
|2022-01-12|2022-01-31|
|2022-02-01|2022-02-28|
|2022-03-01|2022-03-31|
|2022-04-01|2022-04-30|
|2022-05-01|2022-05-31|
|2022-06-01|2022-06-03|
+----------+----------+
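If you are on Spark 2.4 or later, a more compact sketch of the same result is to generate one row per month with sequence and then clip the first and last rows with greatest/least. This is only a sketch against a fresh one-row frame holding the two dates already cast to DATE:
# Sketch, Spark 2.4+ assumed for sequence() over dates.
base = spark.createDataFrame([("12-01-2022", "03-06-2022")], ["start_date", "end_date"]).select(
    F.to_date("start_date", "dd-MM-yyyy").alias("start_date"),
    F.to_date("end_date", "dd-MM-yyyy").alias("end_date"),
)
alt_sdf = (
    base.withColumn(
        "month_start",
        F.explode(
            F.expr("sequence(trunc(start_date, 'MM'), trunc(end_date, 'MM'), interval 1 month)")
        ),
    )
    .select(
        # clip the first month to the real start date and the last month to the real end date
        F.greatest(F.col("month_start"), F.col("start_date")).alias("Valid_From"),
        F.least(F.last_day("month_start"), F.col("end_date")).alias("Valid_To"),
    )
    .orderBy("Valid_From")
)
alt_sdf.show()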
I am trying to write pyspark code to fit into 2 scenarios.
Scenario 1:
Input data:
col1|col2|date
100|Austin|2021-01-10
100|Newyork|2021-02-15
100|Austin|2021-03-02
Expected output with CDC:
col1|col2|start_date|end_date
100|Austin|2021-01-10|2021-02-15
100|Newyork|2021-02-15|2021-03-02
100|Austin|2021-03-02|2099-12-31
Between consecutive records the col2 value changes, and I want to maintain CDC (one start_date/end_date version per change).
Scenario 2:
Input:
col1|col2|date
100|Austin|2021-01-10
100|Austin|2021-03-02 -> I want to eliminate this version because there is no change in col1 and col2 values between records.
Expected Output:
col1|col2|start_date|end_date
100|Austin|2021-01-10|2099-12-31
I am looking for the same code to work in both scenarios.
I am trying something like this, but it doesn't work for both scenarios:
inputdf = inputdf.groupBy('col1','col2','date').agg(
F.min("date").alias("r_date"))
inputdf = inputdf.drop("date").withColumnRenamed("r_date", "start_date")
my_allcolumnwindowasc = Window.partitionBy('col1','col2').orderBy("start_date")
inputdf = inputdf.withColumn('dropDuplicates',F.lead(inputdf.start_date).over(my_allcolumnwindowasc)).where(F.col("dropDuplicates").isNotNull()).drop('dropDuplicates')
There are more than 20 columns in some of the scenarios.
Thanks for the help!
Check this out.
Steps:
Use a window function to assign the row number
Convert the dataframe to a view
Use a self join (the condition checks are the key)
Use the lead window function wrapped in coalesce so that a null value yields the "2099-12-31" default
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
spark = SparkSession \
.builder \
.appName("SO") \
.getOrCreate()
df = spark.createDataFrame(
[(100, "Austin", "2021-01-10"),
(100, "Newyork", "2021-02-15"),
(100, "Austin", "2021-03-02"),
],
['col1', 'col2', 'date']
)
# df = spark.createDataFrame(
# [(100, "Austin", "2021-01-10"),
# (100, "Austin", "2021-03-02"),
# ],
# ['col1', 'col2', 'date']
# )
df1 = df.withColumn("start_date", F.to_date("date"))
w = Window.partitionBy("col1",).orderBy("start_date")
df_1 = df1.withColumn("rn", F.row_number().over(w))
df_1.createTempView("temp_1")
df_dupe = spark.sql('select temp_1.col1,temp_1.col2,temp_1.start_date, case when temp_1.col1=temp_2.col1 and temp_1.col2=temp_2.col2 then "delete" else "no-delete" end as dupe from temp_1 left join temp_1 as temp_2 '
'on temp_1.col1=temp_2.col1 and temp_1.col2=temp_2.col2 and temp_1.rn-1 = temp_2.rn order by temp_1.start_date ')
df_dupe.filter(F.col("dupe")=="no-delete").drop("dupe")\
.withColumn("end_date", F.coalesce(F.lead("start_date").over(w),F.lit("2099-12-31"))).show()
# Result:
# Scenario1:
#+----+-------+----------+----------+
# |col1| col2|start_date| end_date|
# +----+-------+----------+----------+
# | 100| Austin|2021-01-10|2021-02-15|
# | 100|Newyork|2021-02-15|2021-03-02|
# | 100| Austin|2021-03-02|2099-12-31|
# +----+-------+----------+----------+
#
# Scenario 2:
# +----+------+----------+----------+
# |col1| col2|start_date| end_date|
# +----+------+----------+----------+
# | 100|Austin|2021-01-10|2099-12-31|
# +----+------+----------+----------+
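For the "more than 20 columns" case, a view-free variant of the same idea is to compare every tracked column with its previous value via lag and drop rows where nothing changed, then derive end_date with lead. This is only a sketch that reuses df1 and w from the code above; the tracked list is a placeholder you would extend with your real columns:
from functools import reduce

# Columns whose change should start a new CDC version (placeholder: extend as needed).
tracked = ["col2"]

# True when every tracked column equals its value on the previous row of the window.
same_as_prev = reduce(
    lambda a, b: a & b,
    [F.lag(c).over(w).eqNullSafe(F.col(c)) for c in tracked],
)

deduped = (
    df1.withColumn("same_as_prev", same_as_prev)
       .filter(~F.col("same_as_prev"))   # keeps the first row per key and every real change
       .drop("same_as_prev", "date")
)

# Recompute lead on the de-duplicated frame to get each version's end_date.
deduped.withColumn(
    "end_date", F.coalesce(F.lead("start_date").over(w), F.lit("2099-12-31"))
).show()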
Input DF:
+------+-----------------------------------------------------+
|rowNum|infoCol |
+------+-----------------------------------------------------+
|100 |[('john', 'customer'), ('abc, mno, xyz', 'purchase')]|
|200 |[('doe', 'customer')] |
+------+-----------------------------------------------------+
root
|-- rowNum: string (nullable = false)
|-- infoCol: string (nullable = false)
(expected) Output DF:
+------+--------+-----------------+
|rowNum|customer| purchase|
+------+--------+-----------------+
| 100|['john']|['abc, mno, xyz']|
| 200| ['doe']| null|
+------+--------+-----------------+
I have tried using the split function but that doesn't quite do what I need.
inputdf = spark.createDataFrame(
[
("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
("200", "[('doe', 'customer')]"),
],
['rowNum', 'infoCol']
)
from pyspark.sql.functions import col, regexp_replace, split
outputdf = inputdf.withColumn("newcol", split(col("infoCol"), ","))
Here is my try with Spark built-in functions.
The idea is to first create two columns, one holding the customer/purchase labels and the other holding the corresponding values; to get these columns I used split and then explode.
Once we have the customer/purchase labels, groupBy + pivot pivots the data, and finally split turns the columns into arrays.
Example:
inputdf = spark.createDataFrame(
[
("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
("200", "[('doe', 'customer')]"),
],
['rowNum', 'infoCol']
)
from pyspark.sql.functions import *
inputdf.withColumn("newcol", split(col("infoCol"), "\),")).\
selectExpr("explode(newcol)","rowNum").\
withColumn("newCol1",split(regexp_replace(col("col"),"[\[|\]|\(|\)]",""),"',")).\
withColumn("new1",regexp_replace(trim(element_at(col("newCol1"),1)),"[']","")).\
withColumn("new2",regexp_replace(trim(element_at(col("newCol1"),2)),"[']","")).\
groupby("rowNum").\
pivot("new2").\
agg(first(col("new1"))).\
withColumn("customer",split(col("customer"),",")).\
withColumn("purchase",split(col("purchase"),",")).\
show()
#+------+--------+-----------------+
#|rowNum|customer| purchase|
#+------+--------+-----------------+
#| 200| [doe]| null|
#| 100| [john]|[abc, mno, xyz]|
#+------+--------+-----------------+
UPDATE:
inputdf = spark.createDataFrame(
[
("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
("200", "[('doe', 'customer')]"),
],
['rowNum', 'infoCol']
)
from pyspark.sql.functions import *
inputdf.withColumn("newcol", split(col("infoCol"), "\),")).\
selectExpr("explode(newcol)","rowNum").\
withColumn("newCol1",split(regexp_replace(col("col"),"[\[|\]|\(|\)]",""),"',")).\
withColumn("new1",regexp_replace(trim(element_at(col("newCol1"),1)),"[']","")).\
withColumn("new2",regexp_replace(trim(element_at(col("newCol1"),2)),"[']","")).\
groupby("rowNum").\
pivot("new2").\
agg(first(col("new1"))).\
withColumn("customer",col("customer")).\
withColumn("purchase",col("purchase")).\
show()
#+------+--------+-------------+
#|rowNum|customer| purchase|
#+------+--------+-------------+
#| 200| doe| null|
#| 100| john|abc, mno, xyz|
#+------+--------+-------------+
UPDATE2:
inputdf = spark.createDataFrame(
[
("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase'), ('abc123', 'purchase')]"),
("200", "[('doe', 'customer')]"),
],
['rowNum', 'infoCol']
)
from pyspark.sql.functions import *
inputdf.withColumn("newcol", split(col("infoCol"), "\),")).\
selectExpr("explode(newcol)","rowNum").\
withColumn("newCol1",expr("""transform(split(regexp_replace(col,"[\[|\]|\(|\)]",""),"',"),x -> regexp_replace(trim(x),"[']",""))""")).\
withColumn("new1",regexp_replace(element_at(col("newCol1"),-1),"[\]]","")).\
withColumn("new2",array_except(col("newCol1"),array(lit('purchase'),lit('customer'),lit('purchase]'),lit('customer]')))).\
withColumn("new2",expr("""transform(new2,x -> concat("'",regexp_replace(x,"[\\\\[]",""),"'"))""")).\
drop(*['col','newCol1']).\
groupby("new1","rowNum").agg(flatten(collect_list(col("new2"))).alias("new2")).\
groupby("rowNum").pivot("new1").agg(first(col("new2"))).\
show(10,False)
#+------+--------+---------------------------+
#|rowNum|customer|purchase |
#+------+--------+---------------------------+
#|200 |['doe'] |null |
#|100 |['john']|['abc, mno, xyz', 'abc123']|
#+------+--------+---------------------------+
Here is my try; it can be used with many columns, not only customer and purchase, as long as the column name is the last element of each tuple.
import pyspark.sql.functions as f
df = inputdf \
.withColumn('infoCol', f.regexp_replace('infoCol', '[\[\]]', '')) \
.withColumn('infoCol', f.regexp_replace('infoCol', '(\),)', ') ,')) \
.withColumn('infoCol', f.explode(f.split('infoCol', ' , '))) \
.withColumn('infoCol', f.regexp_replace('infoCol', '[\(\)]', '')) \
.withColumn('infoCol', f.regexp_replace('infoCol', '(\',)', '\' ,')) \
.withColumn('cols', f.split('infoCol', ' , ')[1]) \
.withColumn('cols', f.regexp_replace('cols', '\'', '')) \
.withColumn('infoCol', f.split('infoCol', ' , ')[0]) \
.withColumn('infoCol', f.concat(f.lit('['), f.col('infoCol'), f.lit(']')))
values = df.select('cols').distinct().rdd.map(lambda x: x.cols).collect()
df.groupBy('rowNum') \
.pivot('cols', values) \
.agg(f.first('infoCol')) \
.show(10, False)
+------+--------+-----------------+
|rowNum|customer|purchase |
+------+--------+-----------------+
|200 |['doe'] |null |
|100 |['john']|['abc, mno, xyz']|
+------+--------+-----------------+
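A different angle from both answers above: because infoCol is effectively a Python literal, it can also be parsed with ast.literal_eval inside a UDF and then pivoted. This is only a sketch and assumes every infoCol value is a well-formed list of (value, label) tuples:
import ast
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

pair_schema = ArrayType(StructType([
    StructField("value", StringType()),
    StructField("label", StringType()),
]))

@F.udf(pair_schema)
def parse_infocol(s):
    # Turn "[('john', 'customer'), ...]" into an array of (value, label) structs.
    return [(str(v), str(k)) for v, k in ast.literal_eval(s)] if s else None

outputdf = (
    inputdf.withColumn("pair", F.explode(parse_infocol("infoCol")))
           .select("rowNum", F.col("pair.label").alias("label"), F.col("pair.value").alias("value"))
           .groupBy("rowNum")
           .pivot("label")
           .agg(F.collect_list("value"))
)
outputdf.show(truncate=False)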
I am trying to convert a String to a Timestamp
from pyspark.sql import functions as psf
target_df = df \
.withColumn(
'my_ts',
psf.when(
psf.to_timestamp(psf.col("my_ts"), "dd/MM/yyyy HH:mm:ss").isNotNull(),
psf.to_timestamp("my_ts", "dd/MM/yyyy HH:mm:ss")
) \
.psf.when(
psf.to_timestamp(psf.col("my_ts"), "dd-MMM-yy").isNotNull(),
psf.to_timestamp("my_ts", "dd-MMM-yy")
) \
.psf.when(
psf.to_timestamp(psf.col("my_ts"), "yyyyMMdd").isNotNull(),
psf.to_timestamp("my_ts", "yyyyMMdd")
) \
.otherwise(None)
)
However, I get the following error:
IllegalArgumentException: 'when() can only be applied on a Column previously generated by when() function'
I have tried wrapping psf.col() around psf.to_timestamp(), but that also gives an error. Any ideas how to resolve this?
You were pretty much there; it's just that when().psf.when() doesn't work. If you use when directly, it works.
from pyspark.sql import functions as psf
from pyspark.sql.functions import when
df = sqlContext.createDataFrame(
[
["2019-01-12"],
["20190112"],
["12/01/2019 11:22:11"],
["12-Jan-19"]
], ["my_ts"])
target_df = df \
.withColumn(
'my_new_ts',
when(
psf.to_timestamp(psf.col("my_ts"), "dd/MM/yyyy HH:mm:ss").isNotNull(),
psf.to_timestamp("my_ts", "dd/MM/yyyy HH:mm:ss")
) \
.when(
psf.to_timestamp(psf.col("my_ts"), "dd-MMM-yy").isNotNull(),
psf.to_timestamp("my_ts", "dd-MMM-yy")
) \
.when(
psf.to_timestamp(psf.col("my_ts"), "yyyyMMdd").isNotNull(),
psf.to_timestamp("my_ts", "yyyyMMdd")
) \
.otherwise(None)
)
df.show()
target_df.show()
Output:
+-------------------+
| my_ts|
+-------------------+
| 2019-01-12|
| 20190112|
|12/01/2019 11:22:11|
| 12-Jan-19|
+-------------------+
+-------------------+-------------------+
| my_ts| my_new_ts|
+-------------------+-------------------+
| 2019-01-12| null|
| 20190112|2019-01-12 00:00:00|
|12/01/2019 11:22:11|2019-01-12 11:22:11|
| 12-Jan-19|2019-01-12 00:00:00|
+-------------------+-------------------+
Also, if you want a more concise version, you can use psf.coalesce:
from pyspark.sql import functions as psf
target_df = df.select("*",
psf.coalesce(
psf.to_timestamp("my_ts", "dd/MM/yyyy HH:mm:ss"),
psf.to_timestamp("my_ts", "dd-MMM-yy"),
psf.to_timestamp("my_ts", "yyyyMMdd")
).alias("my_new_ts"))
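If the list of candidate formats keeps growing, the same coalesce idea can be driven from a plain Python list; a small sketch using the three formats above:
from pyspark.sql import functions as psf

formats = ["dd/MM/yyyy HH:mm:ss", "dd-MMM-yy", "yyyyMMdd"]
target_df = df.select(
    "*",
    # the first format that parses wins; rows matching none stay null
    psf.coalesce(*[psf.to_timestamp("my_ts", fmt) for fmt in formats]).alias("my_new_ts"),
)
target_df.show()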
I have a CSV file in an HDFS location that I have converted to a dataframe, and the dataframe looks like this:
column1,column2,column3
Node1, block1, 1,4,5
Node1, block1, null
Node1, block2, 3,6,7
Node1, block2, null
Node1, block1, null
I would like to parse this dataframe, and my output dataframe should look like this:
column1,column2,column3
Node1, block1, counter0:1,counter1:4,counter2:5
Node1, block1, null
Node1, block2, counter0:3,counter1:6,counter2:7
Node1, block2, null
Node1, block1, null
I am getting an error, shown below. Can anyone please help me resolve it or suggest corrected/modified code? Thank you.
import pyspark
from pyspark.sql.functions import *
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.types as T
from pyspark.sql.functions import udf
start_value = 2
schema_name = 2
start_key = 0
df = spark.read.csv("hdfs://path/Ccounters/test.csv",header=True)
def dict(x):
split_col = x.split(",")
col_nm = df.schema.names[schema_name]
convert = map(lambda x :col_nm + str(start_key) +":"+str(x) ,split_col)
con_str = ','.join(convert)
return con_str
udf_dict = udf(dict, StringType())
df1 =df.withColumn('distance', udf_dict(df.column3))
df1.show()
I'm getting the error below:
File "/opt/data/data11/yarn/local/usercache/cdap/appcache/application_1555606923440_67815/container_e48_1555606923440_67815_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 160, in dump
pickle.PicklingError: Could not serialize object: Py4JError: An error occurred while calling o58.__getnewargs__. Trace:
py4j.Py4JException: Method __getnewargs__([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:272)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
I found that you cannot reference Spark objects (such as the DataFrame and its schema, which the UDF above does via df.schema.names) inside a UDF, which makes sense (https://stackoverflow.com/a/57230637). An alternative way to do the operation that you want is to use a for loop inside the UDF.
1st EDIT
Added a part that applies this UDF easily to multiple columns, based on the answer to this question: how to get the name of column with maximum value in pyspark dataframe
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

df = spark.createDataFrame([('Node1', 'block1', '1,4,5', None), ('Node1', 'block1', None, '1,2,3'), ('Node1', 'block2', '3,6,7', None), ('Node1', 'block2', None, '4,5,6'), ('Node1', 'block1', None, '7,8,9')], ['column1', 'column2', 'column3', 'column4'])
# df.show()
# +-------+-------+-------+-------+
# |column1|column2|column3|column4|
# +-------+-------+-------+-------+
# | Node1| block1| 1,4,5| null|
# | Node1| block1| null| 1,2,3|
# | Node1| block2| 3,6,7| null|
# | Node1| block2| null| 4,5,6|
# | Node1| block1| null| 7,8,9|
# +-------+-------+-------+-------+
def columnfill(x):
# if x is empty, return x
if x == None:
return x
else:
split = x.split(',')
y = []
z = 0
for i in split:
y.append('counter'+str(z)+':'+str(i))
z += 1
return ','.join(y)
udf_columnfill = udf(columnfill, StringType())
### Apply UDF to a single column:
# df_result1 = df.withColumn('distance', udf_columnfill(df.column3))
### Code for applying UDF to multiple columns
# Define columns that should be transformed
columnnames = ['column3', 'column4']
# Create a condition that joins multiple string parts, containing column operations
cond = "df.withColumn" + ".withColumn".join(["('" + str(c) + "_new', udf_columnfill(df." + str(c) + ")).drop('"+ str(c) +"')" for c in (columnnames)])
# # Print condition to see which transformations are executed
# print(cond)
# df.withColumn('column3_new', udf_columnfill(df.column3)).drop('column3').withColumn('column4_new', udf_columnfill(df.column4)).drop('column4')
# Create the new dataframe that evaluates the defined condition
df_result2 = eval(cond)
# df_result2.show()
# +-------+-------+--------------------------------+--------------------------------+
# |column1|column2|column3_new |column4_new |
# +-------+-------+--------------------------------+--------------------------------+
# |Node1 |block1 |counter0:1,counter1:4,counter2:5|null |
# |Node1 |block1 |null |counter0:1,counter1:2,counter2:3|
# |Node1 |block2 |counter0:3,counter1:6,counter2:7|null |
# |Node1 |block2 |null |counter0:4,counter1:5,counter2:6|
# |Node1 |block1 |null |counter0:7,counter1:8,counter2:9|
# +-------+-------+--------------------------------+--------------------------------+
2nd EDIT
Added an extra UDF input value where the column name is inserted, being the prefix for the column values:
# Updated UDF (the generated expression below uses f.lit, so pyspark.sql.functions is needed as f)
import pyspark.sql.functions as f
def columnfill(cinput, cname):
# if x is empty, return x
if cinput == None:
return cinput
else:
values = cinput.split(',')
output = []
count = 0
for value in values:
output.append(str(cname)+str(count)+":"+str(value))
count += 1
return ','.join(output)
udf_columnfill = udf(columnfill, StringType())
# Define columns that should be transformed
columnnames = ['column3', 'column4']
# Create a condition that joins multiple string parts, containing column operations
cond2 = "df.withColumn" + ".withColumn".join(["('" + str(c) + "_new', udf_columnfill(df." + str(c) + ", f.lit('" + str(c) + "_new'))).drop('"+ str(c) +"')" for c in (columnnames)])
df_result3 = eval(cond2)
# +-------+-------+--------------------------------------------+--------------------------------------------+
# |column1|column2|column3_new |column4_new |
# +-------+-------+--------------------------------------------+--------------------------------------------+
# |Node1 |block1 |column3_new0:1,column3_new1:4,column3_new2:5|null |
# |Node1 |block1 |null |column4_new0:1,column4_new1:2,column4_new2:3|
# |Node1 |block2 |column3_new0:3,column3_new1:6,column3_new2:7|null |
# |Node1 |block2 |null |column4_new0:4,column4_new1:5,column4_new2:6|
# |Node1 |block1 |null |column4_new0:7,column4_new1:8,column4_new2:9|
# +-------+-------+--------------------------------------------+--------------------------------------------+
print(cond2)
# df.withColumn('column3_new', udf_columnfill(df.column3, f.lit('column3_new'))).drop('column3').withColumn('column4_new', udf_columnfill(df.column4, f.lit('column4_new'))).drop('column4')
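If you would rather not build a string and call eval, the same chain of withColumn/drop calls can be applied programmatically with functools.reduce; a sketch that reuses the two-argument udf_columnfill and the df from above:
from functools import reduce
import pyspark.sql.functions as f

columnnames = ['column3', 'column4']

# Fold the column list into a chain of withColumn(...).drop(...) calls, no eval needed.
df_result4 = reduce(
    lambda acc, c: acc.withColumn(c + "_new", udf_columnfill(f.col(c), f.lit(c + "_new"))).drop(c),
    columnnames,
    df,
)
df_result4.show(truncate=False)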