I am trying to convert a String to a Timestamp
from pyspark.sql import functions as psf
target_df = df \
.withColumn(
'my_ts',
psf.when(
psf.to_timestamp(psf.col("my_ts"), "dd/MM/yyyy HH:mm:ss").isNotNull(),
psf.to_timestamp("my_ts", "dd/MM/yyyy HH:mm:ss")
) \
.psf.when(
psf.to_timestamp(psf.col("my_ts"), "dd-MMM-yy").isNotNull(),
psf.to_timestamp("my_ts", "dd-MMM-yy")
) \
.psf.when(
psf.to_timestamp(psf.col("my_ts"), "yyyyMMdd").isNotNull(),
psf.to_timestamp("my_ts", "yyyyMMdd")
) \
.otherwise(None)
)
However, I get the following error:
IllegalArgumentException: 'when() can only be applied on a Column previously generated by when() function'
I have tried wrapping psf.col() around the psf.to_timestamp() calls, but that also gives an error. Any ideas on how to resolve this?
You were pretty much there; the issue is that when().psf.when() is not valid. If you chain .when() directly on the Column returned by the first when(), it works.
from pyspark.sql import functions as psf
from pyspark.sql.functions import when
df = sqlContext.createDataFrame(
[
["2019-01-12"],
["20190112"],
["12/01/2019 11:22:11"],
["12-Jan-19"]
], ["my_ts"])
target_df = df \
.withColumn(
'my_new_ts',
when(
psf.to_timestamp(psf.col("my_ts"), "dd/MM/yyyy HH:mm:ss").isNotNull(),
psf.to_timestamp("my_ts", "dd/MM/yyyy HH:mm:ss")
) \
.when(
psf.to_timestamp(psf.col("my_ts"), "dd-MMM-yy").isNotNull(),
psf.to_timestamp("my_ts", "dd-MMM-yy")
) \
.when(
psf.to_timestamp(psf.col("my_ts"), "yyyyMMdd").isNotNull(),
psf.to_timestamp("my_ts", "yyyyMMdd")
) \
.otherwise(None)
)
df.show()
target_df.show()
Output:
+-------------------+
|              my_ts|
+-------------------+
|         2019-01-12|
|           20190112|
|12/01/2019 11:22:11|
|          12-Jan-19|
+-------------------+
+-------------------+-------------------+
|              my_ts|          my_new_ts|
+-------------------+-------------------+
|         2019-01-12|               null|
|           20190112|2019-01-12 00:00:00|
|12/01/2019 11:22:11|2019-01-12 11:22:11|
|          12-Jan-19|2019-01-12 00:00:00|
+-------------------+-------------------+
Also, if you want a more concise version then you can use psf.coalesce:
from pyspark.sql import functions as psf
target_df = df.select("*",
psf.coalesce(
psf.to_timestamp("my_ts", "dd/MM/yyyy HH:mm:ss"),
psf.to_timestamp("my_ts", "dd-MMM-yy"),
psf.to_timestamp("my_ts", "yyyyMMdd")
).alias("my_new_ts"))
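If the list of candidate formats tends to grow, a small helper keeps the coalesce approach tidy. This is just a sketch of my own (the parse_timestamp name is an assumption, not part of the answer above):
from pyspark.sql import functions as psf

def parse_timestamp(col_name, formats):
    # Returns the first successful parse across the given formats (null if none match).
    return psf.coalesce(*[psf.to_timestamp(col_name, fmt) for fmt in formats])

target_df = df.select(
    "*",
    parse_timestamp("my_ts", ["dd/MM/yyyy HH:mm:ss", "dd-MMM-yy", "yyyyMMdd"]).alias("my_new_ts")
)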
I am reading a table into a dataframe which has a column "day_dt" in the date format "2022/01/08". I want the format to be "1/8/2022" (M/d/yyyy). Is this possible in PySpark? I have tried using date_format(), but it results in null.
Did you cast the day_dt column to timestamp before using date_format()? The code below adds a null-valued column, as you described in your question, because the column is still StringType. You can see this with df.printSchema().
from pyspark.sql.functions import *
from pyspark.sql.types import StringType
d = ['2022/01/08']
df = spark.createDataFrame(d, StringType())
df.show()
df2 = df.withColumn("newDate", date_format(df.value, "MM/dd/yyyy"))
df2.show()
+----------+
|     value|
+----------+
|2022/01/08|
+----------+
+----------+-------+
|     value|newDate|
+----------+-------+
|2022/01/08|   null|
+----------+-------+
After parsing the string with unix_timestamp() and casting it to timestamp, the date column is formatted properly:
df2 = df.withColumn("newDate", date_format(unix_timestamp(df.value, "yyyy/MM/dd").cast("timestamp"), "MM/dd/yyyy"))
df2.show()
+----------+----------+
|     value|   newDate|
+----------+----------+
|2022/01/08|01/08/2022|
+----------+----------+
Hope it helps.
If you mean that you have the date as a string in the format "yyyy/MM/dd" and you want to convert it to a string in the format "M/d/yyyy", then:
First, parse the string to Date type using to_date().
Then, convert the Date type back to a string using date_format().
import pyspark.sql.functions as F

df = spark.createDataFrame(data=[["2022/01/01",],["2022/12/31",]], schema=["date_str_in"])
df = df.withColumn("date_dt", F.to_date("date_str_in", format="yyyy/MM/dd"))
df = df.withColumn("date_str_out", F.date_format("date_dt", format="M/d/yyyy"))
df.show()
+-----------+----------+------------+
|date_str_in|   date_dt|date_str_out|
+-----------+----------+------------+
| 2022/01/01|2022-01-01|    1/1/2022|
| 2022/12/31|2022-12-31|  12/31/2022|
+-----------+----------+------------+
Input DF:
+------+-----------------------------------------------------+
|rowNum|infoCol |
+------+-----------------------------------------------------+
|100 |[('john', 'customer'), ('abc, mno, xyz', 'purchase')]|
|200 |[('doe', 'customer')] |
+------+-----------------------------------------------------+
root
|-- rowNum: string (nullable = false)
|-- infoCol: string (nullable = false)
(expected) Output DF:
+------+--------+-----------------+
|rowNum|customer|         purchase|
+------+--------+-----------------+
|   100|['john']|['abc, mno, xyz']|
|   200| ['doe']|             null|
+------+--------+-----------------+
I have tried using the split function but that doesn't quite do what I need.
inputdf = spark.createDataFrame(
[
("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
("200", "[('doe', 'customer')]"),
],
['rowNum', 'infoCol']
)
from pyspark.sql.functions import col, regexp_replace, split
outputdf = inputdf.withColumn("newcol", split(col("infoCol"), ","))
Here is my attempt with Spark built-in functions.
The idea is to first create two columns, one holding the customer/purchase labels and one holding the corresponding values; to get these columns I used split and then explode.
Once we have the label and value columns, groupBy + pivot reshapes the data, and a final split turns the values into arrays.
Example:
inputdf = spark.createDataFrame(
[
("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
("200", "[('doe', 'customer')]"),
],
['rowNum', 'infoCol']
)
from pyspark.sql.functions import *
inputdf.withColumn("newcol", split(col("infoCol"), "\),")).\
selectExpr("explode(newcol)","rowNum").\
withColumn("newCol1",split(regexp_replace(col("col"),"[\[|\]|\(|\)]",""),"',")).\
withColumn("new1",regexp_replace(trim(element_at(col("newCol1"),1)),"[']","")).\
withColumn("new2",regexp_replace(trim(element_at(col("newCol1"),2)),"[']","")).\
groupby("rowNum").\
pivot("new2").\
agg(first(col("new1"))).\
withColumn("customer",split(col("customer"),",")).\
withColumn("purchase",split(col("purchase"),",")).\
show()
#+------+--------+-----------------+
#|rowNum|customer| purchase|
#+------+--------+-----------------+
#| 200| [doe]| null|
#| 100| [john]|[abc, mno, xyz]|
#+------+--------+-----------------+
UPDATE:
inputdf = spark.createDataFrame(
[
("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
("200", "[('doe', 'customer')]"),
],
['rowNum', 'infoCol']
)
from pyspark.sql.functions import *
inputdf.withColumn("newcol", split(col("infoCol"), "\),")).\
selectExpr("explode(newcol)","rowNum").\
withColumn("newCol1",split(regexp_replace(col("col"),"[\[|\]|\(|\)]",""),"',")).\
withColumn("new1",regexp_replace(trim(element_at(col("newCol1"),1)),"[']","")).\
withColumn("new2",regexp_replace(trim(element_at(col("newCol1"),2)),"[']","")).\
groupby("rowNum").\
pivot("new2").\
agg(first(col("new1"))).\
withColumn("customer",col("customer")).\
withColumn("purchase",col("purchase")).\
show()
#+------+--------+-------------+
#|rowNum|customer| purchase|
#+------+--------+-------------+
#| 200| doe| null|
#| 100| john|abc, mno, xyz|
#+------+--------+-------------+
UPDATE2:
inputdf = spark.createDataFrame(
[
("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase'), ('abc123', 'purchase')]"),
("200", "[('doe', 'customer')]"),
],
['rowNum', 'infoCol']
)
from pyspark.sql.functions import *
inputdf.withColumn("newcol", split(col("infoCol"), "\),")).\
selectExpr("explode(newcol)","rowNum").\
withColumn("newCol1",expr("""transform(split(regexp_replace(col,"[\[|\]|\(|\)]",""),"',"),x -> regexp_replace(trim(x),"[']",""))""")).\
withColumn("new1",regexp_replace(element_at(col("newCol1"),-1),"[\]]","")).\
withColumn("new2",array_except(col("newCol1"),array(lit('purchase'),lit('customer'),lit('purchase]'),lit('customer]')))).\
withColumn("new2",expr("""transform(new2,x -> concat("'",regexp_replace(x,"[\\\\[]",""),"'"))""")).\
drop(*['col','newCol1']).\
groupby("new1","rowNum").agg(flatten(collect_list(col("new2"))).alias("new2")).\
groupby("rowNum").pivot("new1").agg(first(col("new2"))).\
show(10,False)
#+------+--------+---------------------------+
#|rowNum|customer|purchase |
#+------+--------+---------------------------+
#|200 |['doe'] |null |
#|100 |['john']|['abc, mno, xyz', 'abc123']|
#+------+--------+---------------------------+
Here is my attempt; it can be used with many columns, not only customer and purchase, as long as the column name comes last in each tuple.
import pyspark.sql.functions as f
df = inputdf \
.withColumn('infoCol', f.regexp_replace('infoCol', '[\[\]]', '')) \
.withColumn('infoCol', f.regexp_replace('infoCol', '(\),)', ') ,')) \
.withColumn('infoCol', f.explode(f.split('infoCol', ' , '))) \
.withColumn('infoCol', f.regexp_replace('infoCol', '[\(\)]', '')) \
.withColumn('infoCol', f.regexp_replace('infoCol', '(\',)', '\' ,')) \
.withColumn('cols', f.split('infoCol', ' , ')[1]) \
.withColumn('cols', f.regexp_replace('cols', '\'', '')) \
.withColumn('infoCol', f.split('infoCol', ' , ')[0]) \
.withColumn('infoCol', f.concat(f.lit('['), f.col('infoCol'), f.lit(']')))

values = df.select('cols').distinct().rdd.map(lambda x: x.cols).collect()
df.groupBy('rowNum') \
.pivot('cols', values) \
.agg(f.first('infoCol')) \
.show(10, False)
+------+--------+-----------------+
|rowNum|customer|purchase |
+------+--------+-----------------+
|200 |['doe'] |null |
|100 |['john']|['abc, mno, xyz']|
+------+--------+-----------------+
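As a side note, since infoCol is a Python-style list of tuples, a small UDF around ast.literal_eval can replace the regex handling entirely. This is only a sketch of my own (parse_pairs and the struct field names are assumptions, not taken from the answers above):
import ast
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

# Hypothetical helper: parse the stringified list of (value, label) tuples in Python.
parse_pairs = F.udf(
    lambda s: ast.literal_eval(s) if s else [],
    ArrayType(StructType([
        StructField("value", StringType()),
        StructField("label", StringType()),
    ]))
)

outputdf = (inputdf
    .select("rowNum", F.explode(parse_pairs("infoCol")).alias("pair"))
    .select("rowNum", F.col("pair.value").alias("value"), F.col("pair.label").alias("label"))
    .groupBy("rowNum")
    .pivot("label")
    .agg(F.collect_list("value")))

outputdf.show(truncate=False)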
This is the current code:
from pyspark.sql import SparkSession
spark_session = SparkSession\
.builder\
.appName("test")\
.getOrCreate()
lines = spark_session\
.readStream\
.format("socket")\
.option("host", "127.0.0.1")\
.option("port", 9998)\
.load()
The 'lines' looks like this:
+-------------+
|    value    |
+-------------+
|    a,b,c    |
+-------------+
But I want it to look like this:
+---+---+---+
| a | b | c |
+---+---+---+
I tried using the split() method, but it didn't work: it only splits each string into a list in a single column, not into multiple columns.
What should I do?
Split the value column, then create the new columns by accessing the array index, or with element_at() (from Spark 2.4), or with getItem().
from pyspark.sql.functions import *
lines.withColumn("tmp",split(col("value"),',')).\
withColumn("col1",col("tmp")[0]).\
withColumn("col2",col("tmp").getItem(1)).\
withColumn("col3",element_at(col("tmp"),3)).\
drop("tmp","value").\
show()
#+----+----+----+
#|col1|col2|col3|
#+----+----+----+
#|   a|   b|   c|
#+----+----+----+
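One caveat: lines is a streaming DataFrame, so show() cannot be called on it directly; a streaming query has to be started with writeStream. A minimal sketch of writing the split columns to the console sink for testing (the sink choice and query variable are my own assumptions):
from pyspark.sql.functions import col, split

# Sketch only: display the split columns of a streaming DataFrame via the console sink.
query = lines.withColumn("tmp", split(col("value"), ",")) \
    .select(col("tmp")[0].alias("col1"),
            col("tmp")[1].alias("col2"),
            col("tmp")[2].alias("col3")) \
    .writeStream \
    .format("console") \
    .start()

query.awaitTermination()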
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
spark_session = SparkSession\
.builder\
.appName("test")\
.getOrCreate()
lines = spark_session\
.readStream\
.format("socket")\
.option("host", "127.0.0.1")\
.option("port", 9998)\
.load()
split_col = f.split(lines['value'], ",")
df = lines.withColumn('col1', split_col.getItem(0))
df = df.withColumn('col2', split_col.getItem(1))
df = df.withColumn('col3', split_col.getItem(2))
df.show()
In case you have different numbers of delimiters, and not just 3 for each row, you can use the below:
Input:
+-------+
|value  |
+-------+
|a,b,c  |
|d,e,f,g|
+-------+
Solution
import pyspark.sql.functions as F

# Maximum number of commas in any row (strip every non-comma character, then take the length).
max_size = df.select(F.max(F.length(F.regexp_replace('value','[^,]','')))).first()[0]

# max_size commas means max_size + 1 fields, so build that many columns.
out = df.select([F.split("value",',')[x].alias(f"Col{x+1}") for x in range(max_size+1)])
Output
out.show()
+----+----+----+----+
|Col1|Col2|Col3|Col4|
+----+----+----+----+
|   a|   b|   c|null|
|   d|   e|   f|   g|
+----+----+----+----+
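For what it's worth, an equivalent way to compute the width (a sketch of mine, not part of the answer above) is to count the split elements directly instead of the commas:
import pyspark.sql.functions as F

# Same idea, counting array elements instead of commas.
max_size = df.select(F.max(F.size(F.split("value", ",")))).first()[0]
out = df.select([F.split("value", ",")[x].alias(f"Col{x+1}") for x in range(max_size)])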
I have a normal timestamp column in my PySpark dataframe. I want to get the starting date of the week from the given date in a new column.
For spark <= 2.2.0
Please use this:
from pyspark.sql.functions import weekofyear, year, to_date, concat, lit, col
from pyspark.sql.session import SparkSession
from pyspark.sql.types import TimestampType
spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([['2020-10-03 05:00:00']], schema=['timestamp']) \
.withColumn('timestamp', col('timestamp').astype(TimestampType())) \
.withColumn('week', weekofyear('timestamp')) \
.withColumn('year', year('timestamp')) \
.withColumn('date_of_the_week', to_date(concat('week', lit('/'), 'year'), "w/yyyy")) \
.show(truncate=False)
+-------------------+----+----+----------------+
|timestamp          |week|year|date_of_the_week|
+-------------------+----+----+----------------+
|2020-10-03 05:00:00|40  |2020|2020-09-27      |
+-------------------+----+----+----------------+
For spark > 2.2.0
from pyspark.sql.functions import date_trunc, col
from pyspark.sql.session import SparkSession
from pyspark.sql.types import TimestampType
spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([['2020-10-03 05:00:00']], schema=['timestamp']) \
.withColumn('timestamp', col('timestamp').astype(TimestampType())) \
.withColumn('date_of_the_week', date_trunc(timestamp='timestamp', format='week')) \
.show(truncate=False)
+-------------------+-------------------+
|timestamp          |date_of_the_week   |
+-------------------+-------------------+
|2020-10-03 05:00:00|2020-09-28 00:00:00|
+-------------------+-------------------+
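As a side note, a commonly used alternative that gives a Monday-based week start and also works on older Spark versions (a sketch reusing the timestamp column from the example above) combines next_day() with date_sub():
from pyspark.sql.functions import col, date_sub, next_day

# next_day() returns the first Monday strictly after the date, so subtracting
# 7 days gives the Monday that starts the week containing the date.
df = df.withColumn('week_start', date_sub(next_day(col('timestamp'), 'Mon'), 7))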
I have a Spark data frame with the columns "date" (of type timestamp) and "quantity" (of type long). For each date, I have some value for quantity. The dates are sorted in increasing order, but some dates are missing.
For example, the current df:
Date | Quantity
10-09-2016 | 1
11-09-2016 | 2
14-09-2016 | 0
16-09-2016 | 1
17-09-2016 | 0
20-09-2016 | 2
As you can see, the df is missing some dates, such as 12-09-2016 and 13-09-2016. I want to put 0 in the quantity field for those missing dates, so that the resulting df looks like this:
Date | Quantity
10-09-2016 | 1
11-09-2016 | 2
12-09-2016 | 0
13-09-2016 | 0
14-09-2016 | 0
15-09-2016 | 0
16-09-2016 | 1
17-09-2016 | 0
18-09-2016 | 0
19-09-2016 | 0
20-09-2016 | 2
Any help/suggestion regarding this will be appreciated. Thanks in advance.
Note that I am coding in Scala.
I have written this answer in a somewhat verbose way so the code is easy to follow; it can be optimized.
Needed imports
import java.time.format.DateTimeFormatter
import java.time.{LocalDate, LocalDateTime}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{LongType, TimestampType}
UDF to convert the string to a valid date format
val date_transform = udf((date: String) => {
val dtFormatter = DateTimeFormatter.ofPattern("d-M-y")
val dt = LocalDate.parse(date, dtFormatter)
"%4d-%2d-%2d".format(dt.getYear, dt.getMonthValue, dt.getDayOfMonth)
.replaceAll(" ", "0")
})
The UDF code below is taken from Iterate over dates range
def fill_dates = udf((start: String, excludedDiff: Int) => {
val dtFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
val fromDt = LocalDateTime.parse(start, dtFormatter)
(1 to (excludedDiff - 1)).map(day => {
val dt = fromDt.plusDays(day)
"%4d-%2d-%2d".format(dt.getYear, dt.getMonthValue, dt.getDayOfMonth)
.replaceAll(" ", "0")
})
})
Setting up sample dataframe (df)
val df = Seq(
("10-09-2016", 1),
("11-09-2016", 2),
("14-09-2016", 0),
("16-09-2016", 1),
("17-09-2016", 0),
("20-09-2016", 2)).toDF("date", "quantity")
.withColumn("date", date_transform($"date").cast(TimestampType))
.withColumn("quantity", $"quantity".cast(LongType))
df.printSchema()
root
|-- date: timestamp (nullable = true)
|-- quantity: long (nullable = false)
df.show()
+-------------------+--------+
|               date|quantity|
+-------------------+--------+
|2016-09-10 00:00:00|       1|
|2016-09-11 00:00:00|       2|
|2016-09-14 00:00:00|       0|
|2016-09-16 00:00:00|       1|
|2016-09-17 00:00:00|       0|
|2016-09-20 00:00:00|       2|
+-------------------+--------+
Create a temporary dataframe (tempDf) to union with df:
val w = Window.orderBy($"date")
val tempDf = df.withColumn("diff", datediff(lead($"date", 1).over(w), $"date"))
.filter($"diff" > 1) // Pick date diff more than one day to generate our date
.withColumn("next_dates", fill_dates($"date", $"diff"))
.withColumn("quantity", lit("0"))
.withColumn("date", explode($"next_dates"))
.withColumn("date", $"date".cast(TimestampType))
tempDf.show(false)
+-------------------+--------+----+------------------------+
|date               |quantity|diff|next_dates              |
+-------------------+--------+----+------------------------+
|2016-09-12 00:00:00|0       |3   |[2016-09-12, 2016-09-13]|
|2016-09-13 00:00:00|0       |3   |[2016-09-12, 2016-09-13]|
|2016-09-15 00:00:00|0       |2   |[2016-09-15]            |
|2016-09-18 00:00:00|0       |3   |[2016-09-18, 2016-09-19]|
|2016-09-19 00:00:00|0       |3   |[2016-09-18, 2016-09-19]|
+-------------------+--------+----+------------------------+
Now union the two dataframes:
val result = df.union(tempDf.select("date", "quantity"))
.orderBy("date")
result.show()
+-------------------+--------+
|               date|quantity|
+-------------------+--------+
|2016-09-10 00:00:00|       1|
|2016-09-11 00:00:00|       2|
|2016-09-12 00:00:00|       0|
|2016-09-13 00:00:00|       0|
|2016-09-14 00:00:00|       0|
|2016-09-15 00:00:00|       0|
|2016-09-16 00:00:00|       1|
|2016-09-17 00:00:00|       0|
|2016-09-18 00:00:00|       0|
|2016-09-19 00:00:00|       0|
|2016-09-20 00:00:00|       2|
+-------------------+--------+
Based on @mrsrinivas's excellent answer, here is the PySpark version.
Needed imports
from typing import List
import datetime
from pyspark.sql import DataFrame, Window
from pyspark.sql.functions import col, lit, udf, datediff, lead, explode
from pyspark.sql.types import DateType, ArrayType
UDF to create the range of next dates
def _get_next_dates(start_date: datetime.date, diff: int) -> List[datetime.date]:
    return [start_date + datetime.timedelta(days=days) for days in range(1, diff)]
Function to create the DataFrame that fills in the missing dates (it supports "grouping" columns):
def _get_fill_dates_df(df: DataFrame, date_column: str, group_columns: List[str], fill_column: str) -> DataFrame:
    get_next_dates_udf = udf(_get_next_dates, ArrayType(DateType()))
    window = Window.orderBy(*group_columns, date_column)

    return df.withColumn("_diff", datediff(lead(date_column, 1).over(window), date_column)) \
        .filter(col("_diff") > 1).withColumn("_next_dates", get_next_dates_udf(date_column, "_diff")) \
        .withColumn(fill_column, lit("0")).withColumn(date_column, explode("_next_dates")) \
        .drop("_diff", "_next_dates")
The usage of the function:
fill_df = _get_fill_dates_df(df, "Date", [], "Quantity")
df = df.union(fill_df)
It assumes that the date column is already in date type.
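If the Date column is still a string (for example "10-09-2016" as in the question), it needs one extra cast before calling the function; a sketch assuming that dd-MM-yyyy pattern:
from pyspark.sql.functions import to_date

# Assumes the dates are strings like "10-09-2016" (dd-MM-yyyy).
df = df.withColumn("Date", to_date("Date", "dd-MM-yyyy"))
fill_df = _get_fill_dates_df(df, "Date", [], "Quantity")
df = df.union(fill_df)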
Here is a slight modification to use this function with months, passing measure columns (columns that should be set to zero) instead of group columns:
from typing import List
import datetime
from dateutil import relativedelta
import math
import pyspark.sql.functions as f
from pyspark.sql import DataFrame, Window
from pyspark.sql.types import DateType, ArrayType, IntegerType
def fill_time_gaps_date_diff_based(df: DataFrame, measure_columns: list, date_column: str):

    group_columns = [col for col in df.columns if col not in [date_column]+measure_columns]

    # save measure sums for qc
    qc = df.agg({col: 'sum' for col in measure_columns}).collect()

    # convert month to date
    convert_int_to_date = f.udf(lambda mth: datetime.datetime(year=math.floor(mth/100), month=mth%100, day=1), DateType())
    df = df.withColumn(date_column, convert_int_to_date(date_column))

    # sort values
    df = df.orderBy(group_columns)

    # get_fill_dates_df (instead of months_between also use date_diff for days)
    window = Window.orderBy(*group_columns, date_column)

    # calculate diff column
    fill_df = df.withColumn(
        "_diff",
        f.months_between(f.lead(date_column, 1).over(window), date_column).cast(IntegerType())
    ).filter(
        f.col("_diff") > 1
    )

    # generate next dates
    def _get_next_dates(start_date: datetime.date, diff: int) -> List[datetime.date]:
        return [
            start_date + relativedelta.relativedelta(months=months)
            for months in range(1, diff)
        ]

    get_next_dates_udf = f.udf(_get_next_dates, ArrayType(DateType()))

    fill_df = fill_df.withColumn(
        "_next_dates",
        get_next_dates_udf(date_column, "_diff")
    )

    # set measure columns to 0
    for col in measure_columns:
        fill_df = fill_df.withColumn(col, f.lit(0))

    # explode next_dates column
    fill_df = fill_df.withColumn(date_column, f.explode('_next_dates'))

    # drop unnecessary columns
    fill_df = fill_df.drop(
        "_diff",
        "_next_dates"
    )

    # union df with fill_df
    df = df.union(fill_df)

    # qc: should be removed for productive runs
    if qc != df.agg({col: 'sum' for col in measure_columns}).collect():
        raise ValueError('Sums before and after run do not fit.')

    return df
Please note that I assume the month is given as an integer in the form YYYYMM. This could easily be adjusted by modifying the "convert month to date" part.
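For example, a hypothetical variant for daily integer dates of the form YYYYMMDD would only need a different conversion lambda (plus datediff instead of months_between in the diff calculation):
import datetime
import pyspark.sql.functions as f
from pyspark.sql.types import DateType

# Hypothetical variant of the "convert month to date" step for YYYYMMDD integers.
convert_int_to_date = f.udf(
    lambda d: datetime.datetime.strptime(str(d), "%Y%m%d").date(),
    DateType()
)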