Searching for strings from column and creating new column in spark dataframe with results - scala

I have dataframe consisting of columns. I need to check the http link and stored the value code in new column using spark dataframe.
Dataframe :
colA colB ColC colD
A B C a1.abc.com/823659) a1.abc.com/823521)
B C D go.xyz.com/?LinkID=226971 a1.abc.com/823521)
C D E a1.abc.com/?LinkID=226971 go.xyz.com/?LinkID=226975
Required Output:
colA colB ColC colD ColE
A B C a1.abc.com/823659) a1.abc.com/823521) 823659,823521
B C D go.xyz.com/?LinkID=226971 a1.abc.com/823521) 226971,823521
C D E a1.abc.com/?LinkID=226971 go.xyz.com/?LinkID=226975 226971,226975
df.withColumn("colE", regexp_extract(col("colD"), 'regex', ".com"
I have tried using regexp_extract, dont the pattern in not getting matched. Could you please help to get the required output.

Anyway, run this, you can add the other columns, and I am not sure of your string, but assume spaces. Added to the original answer.
import org.apache.spark.sql.functions._
import spark.implicits._
val df = Seq(("A", "a1.abc.com/823659) a1.abc.com/823521 a1.abc.com/9999)"), ("B", "go.xyz.com/?LinkID=226971 a1.abc.com/823521)") ).toDF("col1", "col2")
val df2 = df.withColumn("col3", split(col("col2"), " "))
val df3 = df2.select($"col1", $"col2", array_except($"col3", array(lit(""))).as("col3"))
val df4 = df3.withColumn("xyz", explode($"col3")).withColumn("leng", length($"xyz"))
// Get values to right of .com
val df5 = df4.withColumn("xyz2", substring_index($"xyz", ".com", -1))
val df6 = df5.withColumn("col4", regexp_replace(df5("xyz2"), "[^0-9]", "") )
val df7 = df6.select($"col1", $"col4")
val dfres = df7.groupBy("col1").agg(collect_list(col("col4")).as("col2"))
dfres.show(false)
returns on my data:
+----+----------------------+
|col1|col2 |
+----+----------------------+
|B |[226971, 823521] |
|A |[823659, 823521, 9999]|
+----+----------------------+

Related

Spark SQL Split or Extract words from String of Words

I have a spark Dataframe like Below.I'm trying to split the column into 2 more columns:
date time content
28may 11am [ssid][customerid,shopid]
val personDF2 = personDF.withColumn("temp",split(col("content"),"\\[")).select(
col("*") +: (0 until 3).map(i => col("temp").getItem(i).as(s/col$i)): _*)
date time content col1 col2 col3
28may 11 [ssid][customerid,shopid] ssid customerid shopid
Assuming a String to represent an Array of Words. Got your request. You can optimize the number of dataframes as well to reduce load on system. If there are more than 9 cols etc. you may need to use c00, c01, etc. for c10 etc. Or just use integer as name for columns. leave that up to you.
import org.apache.spark.sql.functions._
import scala.collection.mutable.WrappedArray
// Set up data
val df = spark.sparkContext.parallelize(Seq(
("A", "[foo][customerid,shopid][Donald,Trump,Esq][single]"),
("B", "[foo]")
)).toDF("k", "v")
val df2 = df.withColumn("words_temp", regexp_replace($"v", lit("]"), lit("" )))
val df3 = df2.withColumn("words_temp2", regexp_replace($"words_temp", lit(","), lit("[" ))).drop("words_temp")
val df4 = df3.withColumn("words_temp3", expr("substring(words_temp2, 2, length(words_temp2))")).withColumn("cnt", expr("length(words_temp2)")).drop("words_temp2")
val df5 = df4.withColumn("words",split(col("words_temp3"),"\\[")).drop("words_temp3")
val df6 = df5.withColumn("num_words", size($"words"))
val df7 = df6.withColumn("v2", explode($"words"))
// Convert to Array of sorts via group by
val df8 = df7.groupBy("k")
.agg(collect_list("v2"))
// Convert to rdd Tuple and then find position so as to gen col names! That is the clue so as to be able to use pivot
val rdd = df8.rdd
val rdd2 = rdd.map(row => (row.getAs[String](0), row.getAs[WrappedArray[String]](1).toArray))
val rdd3 = rdd2.map { case (k, list) => (k, list.zipWithIndex) }
val df9 = rdd3.toDF("k", "v")
val df10 = df9.withColumn("vn", explode($"v"))
val df11 = df10.select($"k", $"vn".getField("_1"), concat(lit("c"),$"vn".getField("_2"))).toDF("k", "v", "c")
// Final manipulation
val result = df11.groupBy("k")
.pivot("c")
.agg(expr("coalesce(first(v),null)")) // May never occur in your case, just done for completeness and variable length cols.
result.show(100,false)
returns in this case:
+---+---+----------+------+------+-----+----+------+
|k |c0 |c1 |c2 |c3 |c4 |c5 |c6 |
+---+---+----------+------+------+-----+----+------+
|B |foo|null |null |null |null |null|null |
|A |foo|customerid|shopid|Donald|Trump|Esq |single|
+---+---+----------+------+------+-----+----+------+
Update:
Based on original title stating array of words. See other answer.
If new, then a few things here. Can also be done with dataset and map I assume. Here is a solution using DFs and rdd's. I might well investigate a complete DS in future, but this works for sure and at scale.
// Can amalgamate more steps
import org.apache.spark.sql.functions._
import scala.collection.mutable.WrappedArray
// Set up data
val df = spark.sparkContext.parallelize(Seq(
("A", Array(Array("foo", "bar"), Array("Donald", "Trump","Esq"), Array("single"))),
("B", Array(Array("foo2", "bar2"), Array("single2"))),
("C", Array(Array("foo3", "bar3", "x", "y", "z")))
)).toDF("k", "v")
// flatten via 2x explode, can be done more elegeantly with def or UDF, but keeping it simple here
val df2 = df.withColumn("v2", explode($"v"))
val df3 = df2.withColumn("v3", explode($"v2"))
// Convert to Array of sorts via group by
val df4 = df3.groupBy("k")
.agg(collect_list("v3"))
// Convert to rdd Tuple and then find position so as to gen col names! That is the clue so as to be able to use pivot
val rdd = df4.rdd
val rdd2 = rdd.map(row => (row.getAs[String](0), row.getAs[WrappedArray[String]](1).toArray))
val rdd3 = rdd2.map { case (k, list) => (k, list.zipWithIndex) }
val df5 = rdd3.toDF("k", "v")
val df6 = df5.withColumn("vn", explode($"v"))
val df7 = df6.select($"k", $"vn".getField("_1"), concat(lit("c"),$"vn".getField("_2"))).toDF("k", "v", "c")
// Final manipulation
val result = df7.groupBy("k")
.pivot("c")
.agg(expr("coalesce(first(v),null)")) // May never occur in your case, just done for completeness and variable length cols.
result.show(100,false)
returns in correct col order:
+---+----+----+-------+-----+----+------+
|k |c0 |c1 |c2 |c3 |c4 |c5 |
+---+----+----+-------+-----+----+------+
|B |foo2|bar2|single2|null |null|null |
|C |foo3|bar3|x |y |z |null |
|A |foo |bar |Donald |Trump|Esq |single|
+---+----+----+-------+-----+----+------+

How to append a string column to array string column in Scala Spark without using UDF?

I have a table which has a column containing array like this -
Student_ID | Subject_List | New_Subject
1 | [Mat, Phy, Eng] | Chem
I want to append the new subject into the subject list and get the new list.
Creating the dataframe -
val df = sc.parallelize(Seq((1, Array("Mat", "Phy", "Eng"), "Chem"))).toDF("Student_ID","Subject_List","New_Subject")
I have tried this with UDF as follows -
def append_list = (arr: Seq[String], s: String) => {
arr :+ s
}
val append_list_UDF = udf(append_list)
val df_new = df.withColumn("New_List", append_list_UDF($"Subject_List",$"New_Subject"))
With UDF, I get the required output
Student_ID | Subject_List | New_Subject | New_List
1 | [Mat, Phy, Eng] | Chem | [Mat, Phy, Eng, Chem]
Can we do it without udf ? Thanks.
In Spark 2.4 or later a combination of array and concat should do the trick,
import org.apache.spark.sql.functions.{array, concat}
import org.apache.spark.sql.Column
def append(arr: Column, col: Column) = concat(arr, array(col))
df.withColumn("New_List", append($"Subject_List",$"New_Subject")).show
+----------+---------------+-----------+--------------------+
|Student_ID| Subject_List|New_Subject| New_List|
+----------+---------------+-----------+--------------------+
| 1|[Mat, Phy, Eng]| Chem|[Mat, Phy, Eng, C...|
+----------+---------------+-----------+--------------------+
but I wouldn't expect serious performance gains here.
val df = Seq((1, Array("Mat", "Phy", "Eng"), "Chem"),
(2, Array("Hindi", "Bio", "Eng"), "IoT"),
(3, Array("Python", "R", "scala"), "C")).toDF("Student_ID","Subject_List","New_Subject")
df.show(false)
val final_df = df.withColumn("exploded", explode($"Subject_List")).select($"Student_ID",$"exploded")
.union(df.select($"Student_ID",$"New_Subject"))
.groupBy($"Student_ID").agg(collect_list($"exploded") as "Your_New_List").show(false)
[enter code here][1]

Spark generate a list of column names that contains(SQL LIKE) a string

This one below is a simple syntax to search for a string in a particular column uisng SQL Like functionality.
val dfx = df.filter($"name".like(s"%${productName}%"))
The questions is How do I grab each and every column NAME that contained the particular string in its VALUES and generate a new column with a list of those "column names" for every row.
So far this is the approach I took but stuck as I cant use spark-sql "Like" function inside a UDF.
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import spark.implicits._
val df1 = Seq(
(0, "mango", "man", "dit"),
(1, "i-man", "man2", "mane"),
(2, "iman", "mango", "ho"),
(3, "dim", "kim", "sim")
).toDF("id", "col1", "col2", "col3")
val df2 = df1.columns.foldLeft(df1) {
(acc: DataFrame, colName: String) =>
acc.withColumn(colName, concat(lit(colName + "="), col(colName)))
}
val df3 = df2.withColumn("merged_cols", split(concat_ws("X", df2.columns.map(c=> col(c)):_*), "X"))
Here is a sample output. Note that here there are only 3 columns but in the real job I'll be reading multiple tables which can contain dynamic number of columns.
+--------------------------------------------+
|id | col1| col2| col3| merged_cols
+--------------------------------------------+
0 | mango| man | dit | col1, col2
1 | i-man| man2 | mane | col1, col2, col3
2 | iman | mango| ho | col1, col2
3 | dim | kim | sim|
+--------------------------------------------+
This can be done using a foldLeft over the columns together with when and otherwise:
val e = "%man%"
val df2 = df1.columns.foldLeft(df.withColumn("merged_cols", lit(""))){(df, c) =>
df.withColumn("merged_cols", when(col(c).like(e), concat($"merged_cols", lit(s"$c,"))).otherwise($"merged_cols"))}
.withColumn("merged_cols", expr("substring(merged_cols, 1, length(merged_cols)-1)"))
All columns that satisfies the condition e will be appended to the string in the merged_cols column. Note that the column must exist for the first append to work so it is added (containing an empty string) to the dataframe when sent into the foldLeft.
The last row in the code simply removes the extra , that is added in the end. If you want the result as an array instead, simply adding .withColumn("merged_cols", split($"merged_cols", ",")) would work.
An alternative appraoch is to instead use an UDF. This could be preferred when dealing with many columns since foldLeft will create multiple dataframe copies. Here regex is used (not the SQL like since that operates on whole columns).
val e = ".*man.*"
val concat_cols = udf((vals: Seq[String], names: Seq[String]) => {
vals.zip(names).filter{case (v, n) => v.matches(e)}.map(_._2)
})
val df2 = df.withColumn("merged_cols", concat_cols(array(df.columns.map(col(_)): _*), typedLit(df.columns.toSeq)))
Note: typedLit can be used in Spark versions 2.2+, when using older versions use array(df.columns.map(lit(_)): _*) instead.

how many ways are there to add a new column to a data frame RDD in Spark API?

I can think of one only using withColumn():
val df = sc.dataFrame.withColumn('newcolname',{ lambda row: row + 1 } )
but how would I generalize this to Text data? For example of my DataFrame had
strning values say "This is an example of a string" and I wanted to extract the
first and last word as in val arraystring : Array[String] = Array(first,last)
Is this the thing you're looking for?
val sc: SparkContext = ...
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
val extractFirstWord = udf((sentence: String) => sentence.split(" ").head)
val extractLastWord = udf((sentence: String) => sentence.split(" ").reverse.head)
val sentences = sc.parallelize(Seq("This is an example", "And this is another one", "One_word", "")).toDF("sentence")
val splits = sentences
.withColumn("first_word", extractFirstWord(col("sentence")))
.withColumn("last_word", extractLastWord(col("sentence")))
splits.show()
Then the output is:
+--------------------+----------+---------+
| sentence|first_word|last_word|
+--------------------+----------+---------+
| This is an example| This| example|
|And this is anoth...| And| one|
| One_word| One_word| One_word|
| | | |
+--------------------+----------+---------+
# Create a simple DataFrame, stored into a partition directory
df1 = sqlContext.createDataFrame(sc.parallelize(range(1, 6))\
.map(lambda i: Row(single=i, double=i * 2)))
df1.save("data/test_table/key=1", "parquet")
# Create another DataFrame in a new partition directory,
# adding a new column and dropping an existing column
df2 = sqlContext.createDataFrame(sc.parallelize(range(6, 11))
.map(lambda i: Row(single=i, triple=i * 3)))
df2.save("data/test_table/key=2", "parquet")
# Read the partitioned table
df3 = sqlContext.parquetFile("data/test_table")
df3.printSchema()
https://spark.apache.org/docs/1.3.1/sql-programming-guide.html

scala spark - matching dataframes based on variable dates

I'm trying to match two dataframes based on a variable date window. I am not simply trying to get an exact match, which my code achieves but to get all likely candidates within a variable day window.
I was able to get exact matches on dates with my code.
But I want to find out if the records are still viable to match since they could be a few days off either side but would still be reasonable enough to join on.
I've tried looking for something similar to python's pd.to_timedelta('1 day') in spark to add to the filter but alas have struck no luck.
Here is my current code which matches the dataframe on the ID column and then runs a filter to ensure that the from_date in the second dataframe is between the start_date and the end_date of the first dataframe.
What I need is not the exact date match but be able to match records if they fall between a day or two (either side) of the actual dates.
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df1 = spark.read.option("header","true")
.option("inferSchema","true").csv("../data/df1.csv")
val df2 = spark.read.option("header","true")
.option("inferSchema","true")
.csv("../data/df2.csv")
val df = df2.join(df1,
(df1("ID") === df2("ID")) &&
(df2("from_date") >= df1("start_date")) &&
(df2("from_date") <= df1("end_date")),"left")
.select(df1("ID"), df1("start_date"), df1("end_date"),
$"from_date", $"to_date")
df.coalesce(1).write.format("com.databricks.spark.csv")
.option("header", "true").save("../mydata.csv")
Essentially I want to be able to edit this date window to increase or decrease the data actually matching.
Would really appreciate your input. I'm new to spark/scala but gotta say I'm loving it so far ... soo much faster (and cleaner) than python!
cheers
You can apply date_add and date_sub to start_date/end_date in your join condition, as shown below:
import org.apache.spark.sql.functions._
import java.sql.Date
val df1 = Seq(
(1, Date.valueOf("2018-12-01"), Date.valueOf("2018-12-05")),
(2, Date.valueOf("2018-12-01"), Date.valueOf("2018-12-06")),
(3, Date.valueOf("2018-12-01"), Date.valueOf("2018-12-07"))
).toDF("ID", "start_date", "end_date")
val df2 = Seq(
(1, Date.valueOf("2018-11-30")),
(2, Date.valueOf("2018-12-08")),
(3, Date.valueOf("2018-12-08"))
).toDF("ID", "from_date")
val deltaDays = 1
df2.join( df1,
df1("ID") === df2("ID") &&
df2("from_date") >= date_sub(df1("start_date"), deltaDays) &&
df2("from_date") <= date_add(df1("end_date"), deltaDays),
"left_outer"
).show
// +---+----------+----+----------+----------+
// | ID| from_date| ID|start_date| end_date|
// +---+----------+----+----------+----------+
// | 1|2018-11-30| 1|2018-12-01|2018-12-05|
// | 2|2018-12-08|null| null| null|
// | 3|2018-12-08| 3|2018-12-01|2018-12-07|
// +---+----------+----+----------+----------+
You can get the same results using datediff() function also. Check this out:
scala> val df1 = Seq((1, "2018-12-01", "2018-12-05"),(2, "2018-12-01", "2018-12-06"),(3, "2018-12-01", "2018-12-07")).toDF("ID", "start_date", "end_date").withColumn("start_date",'start_date.cast("date")).withColumn("end_date",'end_date.cast("date"))
df1: org.apache.spark.sql.DataFrame = [ID: int, start_date: date ... 1 more field]
scala> val df2 = Seq((1, "2018-11-30"), (2, "2018-12-08"),(3, "2018-12-08")).toDF("ID", "from_date").withColumn("from_date",'from_date.cast("date"))
df2: org.apache.spark.sql.DataFrame = [ID: int, from_date: date]
scala> val delta = 1;
delta: Int = 1
scala> df2.join(df1,df1("ID") === df2("ID") && datediff('from_date,'start_date) >= -delta && datediff('from_date,'end_date)<=delta, "leftOuter").show(false)
+---+----------+----+----------+----------+
|ID |from_date |ID |start_date|end_date |
+---+----------+----+----------+----------+
|1 |2018-11-30|1 |2018-12-01|2018-12-05|
|2 |2018-12-08|null|null |null |
|3 |2018-12-08|3 |2018-12-01|2018-12-07|
+---+----------+----+----------+----------+
scala>