regular expression pyspark dataframe column - pyspark

My dataframe looks like this.
I have a PySpark dataframe and I want to split column A into two columns, A1 and A2, as shown below. I tried to do this with a regex, but it didn't work.
A | A1 | A2
20-13-2012-monday 20-13-2012 monday
20-14-2012-tues 20-14-2012 tues
20-13-2012-wed 20-13-2012 wed
My code looks like this
import re
from pyspark.sql.functions import regexp_extract
reg = r'^([\d]+-[\d]+-[\d]+)'
df=df.withColumn("A1",re.match(reg, df.select(['A'])).group())
df.show()

You can wrap the regex in a UDF and get the required output like this:
>>> import re
>>> from pyspark.sql.types import *
>>> from pyspark.sql.functions import udf
>>> def get_date_day(a):
... x, y = re.split('^([\d]+-[\d]+-[\d]+)', a)[1:]
... return [x, y[1:]]
>>> get_date_day('20-13-2012-monday')
['20-13-2012', 'monday']
>>> get_date_day('20-13-2012-monday')
['20-13-2012', '-monday']
>>> get_date_udf = udf(get_date_day, ArrayType(StringType()))
>>> df = sc.parallelize([('20-13-2012-monday',), ('20-14-2012-tues',), ('20-13-2012-wed',)]).toDF(['A'])
>>> df.show()
+-----------------+
| A|
+-----------------+
|20-13-2012-monday|
| 20-14-2012-tues|
| 20-13-2012-wed|
+-----------------+
>>> df = df.withColumn("A12", get_date_udf('A'))
>>> df.show(truncate=False)
+-----------------+--------------------+
|A |A12 |
+-----------------+--------------------+
|20-13-2012-monday|[20-13-2012, monday]|
|20-14-2012-tues |[20-14-2012, tues] |
|20-13-2012-wed |[20-13-2012, wed] |
+-----------------+--------------------+
>>> df = df.withColumn("A1", udf(lambda x:x[0])('A12')).withColumn("A2", udf(lambda x:x[1])('A12'))
>>> df = df.drop('A12')
>>> df.show(truncate=False)
+-----------------+----------+------+
|A |A1 |A2 |
+-----------------+----------+------+
|20-13-2012-monday|20-13-2012|monday|
|20-14-2012-tues |20-14-2012|tues |
|20-13-2012-wed |20-13-2012|wed |
+-----------------+----------+------+
Hope this helps!

Related

Convert date from "yyyy/mm/dd" format to "M/d/yyyy" format in pyspark dataframe

I am reading a table into a dataframe that has a column "day_dt" with dates in the format "2022/01/08". I want the format to be "1/8/2022" (M/d/yyyy). Is this possible in PySpark? I have tried using date_format(), but it results in null.
Did you cast the day_dt column to timestamp before using date_format? The code below produces a null-valued column, as you described in your question, because the column is StringType. You can confirm this with df.printSchema().
from pyspark.sql.functions import *
from pyspark.sql.types import StringType
d = ['2022/01/08']
df = spark.createDataFrame(d, StringType())
df.show()
# No explicit parse here: date_format receives the raw string column, and
# "2022/01/08" cannot be implicitly cast to a timestamp, so newDate is null.
df2 = df.withColumn("newDate", date_format(df.value, "MM/dd/yyyy"))
df2.show()
+----------+
| value|
+----------+
|2022/01/08|
+----------+
+----------+-------+
| value|newDate|
+----------+-------+
|2022/01/08| null|
+----------+-------+
After casting string type to timestamp, date column is formatted properly:
from pyspark.sql.functions import *
from pyspark.sql.types import StringType
d = ['2022/01/08']
df = spark.createDataFrame(d, StringType())
df.show()
# "MM" is month-of-year; lowercase "mm" is minute-of-hour, which would store
# the month in the minute field of the intermediate timestamp.
df2 = df.withColumn(
    "newDate",
    date_format(unix_timestamp(df.value, "yyyy/MM/dd").cast("timestamp"), "MM/dd/yyyy"),
)
df2.show()
+----------+
| value|
+----------+
|2022/01/08|
+----------+
+----------+----------+
| value| newDate|
+----------+----------+
|2022/01/08|01/08/2022|
+----------+----------+
Hope it helps.
If you have the date as a string in the format "yyyy/MM/dd" and you want to convert it to a string in the format "M/d/yyyy", then:
First parse string to Date type using to_date().
Then, convert Date type to string using date_format.
import pyspark.sql.functions as F
df = spark.createDataFrame(data=[["2022/01/01",],["2022/12/31",]], schema=["date_str_in"])
# Step 1: parse the string into a date column.
df = df.withColumn("date_dt", F.to_date("date_str_in", format="yyyy/MM/dd"))
# Step 2: render the date back to a string in the target pattern.
df = df.withColumn("date_str_out", F.date_format("date_dt", format="M/d/yyyy"))
df.show()
+-----------+----------+------------+
|date_str_in| date_dt|date_str_out|
+-----------+----------+------------+
| 2022/01/01|2022-01-01| 1/1/2022|
| 2022/12/31|2022-12-31| 12/31/2022|
+-----------+----------+------------+

How can a string column in a DataFrame be split into multiple columns in Spark Structured Streaming?

This is the current code:
from pyspark.sql import SparkSession
spark_session = SparkSession\
    .builder\
    .appName("test")\
    .getOrCreate()
# Streaming DataFrame fed from a socket on 127.0.0.1:9998.
lines = spark_session\
    .readStream\
    .format("socket")\
    .option("host", "127.0.0.1")\
    .option("port", 9998)\
    .load()
The 'lines' looks like this:
+-------------+
| value |
+-------------+
| a,b,c |
+-------------+
But I want to look like this:
+---+---+---+
| a | b | c |
+---+---+---+
I tried using the 'split()' method, but it didn't do what I need: it only splits each string into a list within a single column, not into multiple columns.
What should I do?
Split the value column, then create the new columns by accessing the array elements by index, with getItem(), or with element_at() (available from Spark 2.4).
from pyspark.sql.functions import *
# Bracket indexing and getItem() are 0-based; element_at() is 1-based,
# so element_at(..., 3) picks the third item.
lines.withColumn("tmp",split(col("value"),',')).\
    withColumn("col1",col("tmp")[0]).\
    withColumn("col2",col("tmp").getItem(1)).\
    withColumn("col3",element_at(col("tmp"),3)).\
    drop("tmp","value").\
    show()
#+----+----+----+
#|col1|col2|col3|
#+----+----+----+
#| a| b| c|
#+----+----+----+
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
spark_session = SparkSession\
    .builder\
    .appName("test")\
    .getOrCreate()
lines = spark_session\
    .readStream\
    .format("socket")\
    .option("host", "127.0.0.1")\
    .option("port", 9998)\
    .load()
# One array column holding the comma-separated pieces of "value".
split_col = f.split(lines['value'], ",")
# Start from `lines` (the only DataFrame defined here) and give each
# piece its own, distinct column name.
df = lines.withColumn('col1', split_col.getItem(0))
df = df.withColumn('col2', split_col.getItem(1))
df = df.withColumn('col3', split_col.getItem(2))
df.show()
In case the rows contain a varying number of delimited values rather than exactly three, you can use the following:
Input:
+-------+
|value |
+-------+
|a,b,c |
|d,e,f,g|
+-------+
Solution
import pyspark.sql.functions as F
# Strip every non-comma character so the remaining length is the row's comma
# count, then take the largest count across all rows.
max_size = df.select(F.max(F.length(F.regexp_replace('value','[^,]','')))).first()[0]
# A row with k commas has k+1 fields, hence range(max_size+1); rows with fewer
# fields get null in the extra columns.
out = df.select([F.split("value",',')[x].alias(f"Col{x+1}") for x in range(max_size+1)])
Output
out.show()
+----+----+----+----+
|Col1|Col2|Col3|Col4|
+----+----+----+----+
| a| b| c|null|
| d| e| f| g|
+----+----+----+----+

Pyspark substring is not working inside of UDF

I'm trying, in vain, to use the PySpark substring function inside a UDF. Below is my code snippet:
from pyspark.sql.functions import substring
def my_udf(my_str):
try:
my_sub_str = substring(my_str,1, 2)
except Exception:
pass
else:
return (my_sub_str)
apply_my_udf = udf(my_udf)
df = input_data.withColumn("sub_str", apply_my_udf(input_data.col0))
The sample data is-
ABC1234
DEF2345
GHI3456
But when I print the df, I don't get any value in the new column "sub_str" as shown below -
[Row(col0='ABC1234', sub_str=None), Row(col0='DEF2345', sub_str=None), Row(col0='GHI3456', sub_str=None)]
Can anyone please let me know what I'm doing wrong?
You don't need a udf to use substring, here's a cleaner and faster way:
>>> from pyspark.sql import functions as f
>>> df.show()
+-------+
| data|
+-------+
|ABC1234|
|DEF2345|
|GHI3456|
+-------+
>>> df.withColumn("sub_str", f.substring("data", 1, 2)).show()
+-------+-------+
| data|sub_str|
+-------+-------+
|ABC1234| AB|
|DEF2345| DE|
|GHI3456| GH|
+-------+-------+
If you need to use udf for that, you could also try something like:
input_data = spark.createDataFrame([
    (1,"ABC1234"),
    (2,"DEF2345"),
    (3,"GHI3456")
], ("id","col0"))
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
# Inside the UDF the value is a plain Python string, so ordinary slicing works.
udf1 = udf(lambda x:x[0:2],StringType())
# Apply the UDF to the DataFrame built above.
input_data.withColumn('sub_str',udf1('col0')).show()
+---+-------+-------+
| id| col0|sub_str|
+---+-------+-------+
| 1|ABC1234| AB|
| 2|DEF2345| DE|
| 3|GHI3456| GH|
+---+-------+-------+
However, as Mohamed Ali JAMAOUI wrote - you could do without udf easily here.

Date format in pyspark

My data frame looks like -
id date
1 2018-08-23 11:48:22
2 2019-05-03 06:22:01
3 2019-05-13 10:12:15
4 2019-01-22 16:13:29
5 2018-11-27 11:17:19
My expected output is -
id date date1
1 2018-08-23 11:48:22 2018-08
2 2019-05-03 06:22:01 2019-05
3 2019-05-13 10:12:15 2019-05
4 2019-01-22 16:13:29 2019-01
5 2018-11-27 11:17:19 2018-11
How to do it in pyspark?
It looks like you want to drop the day and time details; you can use the date_format function for that:
>>> df.show()
+---+-------------------+
| id| date|
+---+-------------------+
| 1|2018-08-23 11:48:22|
| 2|2019-05-03 06:22:01|
| 3|2019-05-13 10:12:15|
| 4|2019-01-22 16:13:29|
| 5|2018-11-27 11:17:19|
+---+-------------------+
>>> import pyspark.sql.functions as F
>>>
>>> df.withColumn('date1',F.date_format(F.to_date('date','yyyy-MM-dd HH:mm:ss'),'yyyy-MM')).show()
+---+-------------------+-------+
| id| date| date1|
+---+-------------------+-------+
| 1|2018-08-23 11:48:22|2018-08|
| 2|2019-05-03 06:22:01|2019-05|
| 3|2019-05-13 10:12:15|2019-05|
| 4|2019-01-22 16:13:29|2019-01|
| 5|2018-11-27 11:17:19|2018-11|
+---+-------------------+-------+
You can also do this with to_date followed by substring. For example:
import pyspark.sql.functions as F
import pyspark.sql.types as T
rawData = [(1, "2018-08-23 11:48:22"),
(2, "2019-05-03 06:22:01"),
(3, "2019-05-13 10:12:15")]
df = spark.createDataFrame(rawData).toDF("id","my_date")
df.withColumn("new_my_date",\
F.substring(F.to_date(F.col("my_date")), 1,7))\
.show()
+---+-------------------+-----------+
| id| my_date|new_my_date|
+---+-------------------+-----------+
| 1|2018-08-23 11:48:22| 2018-08|
| 2|2019-05-03 06:22:01| 2019-05|
| 3|2019-05-13 10:12:15| 2019-05|
+---+-------------------+-----------+
import pyspark.sql.functions as F
split_col = F.split(df['date'], '-')
df = df.withColumn('year', split_col.getItem(0)).withColumn('month', split_col.getItem(1))
df = df.select(F.concat(df['year'], F.lit('-'),df['month']).alias('year_month'))
df.show()
+----------+
|year_month|
+----------+
| 2018-08|
| 2019-05|
| 2019-05|
| 2019-01|
| 2018-11|
+----------+

Split string on custom Delimiter in pyspark

I have data with column foo which can be
foo
abcdef_zh
abcdf_grtyu_zt
pqlmn#xl
from here I want to create two columns such that
Part 1 Part 2
abcdef zh
abcdf_grtyu zt
pqlmn xl
The code I am using for this is
data = data.withColumn("Part 1",split(data["foo"],substring(data["foo"],-3,1))).get_item(0)
data = data.withColumn("Part 2",split(data["foo"],substring(data["foo"],-3,1))).get_item(1)
However, I am getting a "column not iterable" error.
The following should work
>>> from pyspark.sql import Row
>>> from pyspark.sql.functions import expr
>>> df = sc.parallelize(['abcdef_zh', 'abcdfgrtyu_zt', 'pqlmn#xl']).map(lambda x: Row(x)).toDF(["col1"])
>>> df.show()
+-------------+
| col1|
+-------------+
| abcdef_zh|
|abcdfgrtyu_zt|
| pqlmn#xl|
+-------------+
>>> df.withColumn('part2',df.col1.substr(-2, 3)).withColumn('part1', expr('substr(col1, 1, length(col1)-3)')).select('part1', 'part2').show()
+----------+-----+
| part1|part2|
+----------+-----+
| abcdef| zh|
|abcdfgrtyu| zt|
| pqlmn| xl|
+----------+-----+