I have the following df:
| KSCHL01 | VTEXT01       | KWERT01 | KSCHL02 | VTEXT02             | KWERT02 | KSCHL03 | VTEXT03      | KWERT03 | id |
| ZBTB    | Tarif de base | 4455.00 | ZBFA    | Brut facturé        | 4455.00 | ZBN     | Brut Négocié | 3645.00 | 1  |
| ZBT     | Brut Tarif.   | 222.75  | ZFIF    | Remises fin d'ordre | 0.00    | ZMAJ    | Majorations  | 0.00    | 2  |
I may have more than 13 columns.
I want to transform every 3-column slice into a row, to get this EXPECTED OUTPUT:
| id | KSCHL | VTEXT               | KWERT   |
| 1  | ZBTB  | Tarif de base       | 4455.00 |
| 1  | ZBFA  | Brut facturé        | 4455.00 |
| 1  | ZBN   | Brut Négocié        | 3645.00 |
| 2  | ZBT   | Brut Tarif.         | 222.75  |
| 2  | ZFIF  | Remises fin d'ordre | 0.00    |
| 2  | ZMAJ  | Majorations         | 0.00    |
I did this:
for (i <- 0 to df.columns.length - 4 by 3) {
  var temp = df.select(df.columns.slice(i, i + 3).map(col(_)): _*)
  val columns = temp.columns
  val regex = """[0-9]"""
  val replacingColumns = columns.map(regex.r.replaceAllIn(_, "")) // delete all digits in column names
  val resultDF = replacingColumns.zip(columns).foldLeft(temp) { (tempdf, name) => tempdf.withColumnRenamed(name._2, name._1) }
  res = res.union(resultDF) // append the slice to the final DataFrame `res`
}
which gives me this:
| KSCHL | VTEXT               | KWERT   |
| ZBTB  | Tarif de base       | 4455.00 |
| ZBFA  | Brut facturé        | 4455.00 |
| ZBN   | Brut Négocié        | 3645.00 |
| ZBT   | Brut Tarif.         | 222.75  |
| ZFIF  | Remises fin d'ordre | 0.00    |
| ZMAJ  | Majorations         | 0.00    |
How can I add the id column to every slice in order to have it as a column like in the desired output? I tried:
temp = temp.withColumn("id", df.id)
but I had this error:
error: value id in class Dataset cannot be accessed in org.apache.spark.sql.DataFrame
Thank you.
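For reference, the error comes from `df.id` not being valid column syntax in Scala: columns are accessed as df("id") or col("id"). A minimal sketch of one possible fix of the loop above (not the approach in the answers below), which simply keeps the id column inside each slice, assuming the slices are ordered KSCHL/VTEXT/KWERT as in the sample:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

val slices: Seq[DataFrame] =
  (0 to df.columns.length - 4 by 3).map { i =>
    val renamed = df.columns.slice(i, i + 3)
      .map(c => col(c).as(c.replaceAll("[0-9]", "")))  // KSCHL01 -> KSCHL, ...
    df.select((renamed :+ col("id")): _*)              // carry id into every slice
  }

val result = slices.reduce(_ union _)                  // append the slices into one DataFrame
result.show(false)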
Here is how you can rewrite the code. Adjust the range dynamically based on the number of columns:
import org.apache.spark.sql.functions.{array, col, explode, struct}
import spark.implicits._ // for the $"..." column syntax (already in scope in spark-shell)

val range = (1 to 3).map(r => if (r < 10) s"0$r" else s"$r")
val structQuery = $"id" +: range.map(n =>
struct($"KSCHL$n".as("KSCHL"), $"VTEXT$n".as("VTEXT"), $"KWERT$n".as("KWERT")).as(s"struct$n")
)
df.select(structQuery: _*)
.withColumn("new", explode(array(range.map(r => col(s"struct$r")): _*)))
.select("id", "new.*")
.show(false)
Output:
+---+-----+-------------------+------+
|id |KSCHL|VTEXT |KWERT |
+---+-----+-------------------+------+
|1 |ZBTB |Tarif de base |4455.0|
|1 |ZBFA |Brut facturé |4455.0|
|1 |ZBN |Brut Négocié |3645.0|
|2 |ZBT |Brut Tarif. |222.75|
|2 |ZFIF |Remises fin d'ordre|0.0 |
|2 |ZMAJ |Majorations |0.0 |
+---+-----+-------------------+------+
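The hard-coded `(1 to 3)` above can also be derived from the actual column names, so the query adapts when more column groups are present. A small sketch, assuming the suffixes are zero-padded two-digit numbers as in the sample data:
// Derive the numeric suffixes ("01", "02", ...) from the KSCHLnn columns.
val range = df.columns
  .filter(_.startsWith("KSCHL"))
  .map(_.stripPrefix("KSCHL"))
  .sorted
  .toSeq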
Check below code.
scala> df.show(false)
+-------+-------------+-------+-------+-------------------+-------+-------+------------+-------+---+--------------+
|KSCHL01|VTEXT01 |KWERT01|KSCHL02|VTEXT02 |KWERT02|KSCHL03|VTEXT03 |KWERT03|id |KSCHL04 |
+-------+-------------+-------+-------+-------------------+-------+-------+------------+-------+---+--------------+
|ZBTB |Tarif de base|4455.00|ZBFA |Brut facturé |4455.00|ZBN |Brut Négocié|3645.00|1 |sample KSCHL03|
|ZBT |Brut Tarif. |222.75 |ZFIF |Remises fin d'ordre|0.00 |ZMAJ |Majorations |0.00 |2 |sample KSCHL03|
+-------+-------------+-------+-------+-------------------+-------+-------+------------+-------+---+--------------+
scala> val singleColumns = df.columns.filter(c => c.filter(_.isDigit).length == 0).map(col)
singleColumns: Array[org.apache.spark.sql.Column] = Array(id)
scala> val multipleColumns = df.columns.filter(c => c.filter(_.isDigit).length != 0).map(c => (c.filterNot(_.isDigit),c,c.filter(_.isDigit)))
multipleColumns: Array[(String, String, String)] = Array((KSCHL,KSCHL01,01), (VTEXT,VTEXT01,01), (KWERT,KWERT01,01), (KSCHL,KSCHL02,02), (VTEXT,VTEXT02,02), (KWERT,KWERT02,02), (KSCHL,KSCHL03,03), (VTEXT,VTEXT03,03), (KWERT,KWERT03,03), (KSCHL,KSCHL04,04))
scala> val distinctColumns = multipleColumns.map(_._1).distinct
distinctColumns: Array[String] = Array(KSCHL, VTEXT, KWERT)
scala> :paste
// Entering paste mode (ctrl-D to finish)
val colExpr = array(
multipleColumns
.groupBy(_._3)
.map(k => struct(
k._2.map(c => col(c._2).as(c._1)) ++
distinctColumns.filter(c => k._2.filter(_._1 == c).length == 0).map(c => lit("").as(c)) ++
singleColumns:_*
).as("data"))
.toSeq:_*
).as("array_data")
// Exiting paste mode, now interpreting.
colExpr: org.apache.spark.sql.Column = array(named_struct(NamePlaceholder(), KSCHL03 AS `KSCHL`, NamePlaceholder(), VTEXT03 AS `VTEXT`, NamePlaceholder(), KWERT03 AS `KWERT`, NamePlaceholder(), id) AS `data`, named_struct(NamePlaceholder(), KSCHL02 AS `KSCHL`, NamePlaceholder(), VTEXT02 AS `VTEXT`, NamePlaceholder(), KWERT02 AS `KWERT`, NamePlaceholder(), id) AS `data`, named_struct(NamePlaceholder(), KSCHL01 AS `KSCHL`, NamePlaceholder(), VTEXT01 AS `VTEXT`, NamePlaceholder(), KWERT01 AS `KWERT`, NamePlaceholder(), id) AS `data`, named_struct(NamePlaceholder(), KSCHL04 AS `KSCHL`, VTEXT, AS `VTEXT`, KWERT, AS `KWERT`, NamePlaceholder(), id) AS `data`) AS `array_data`
scala> :paste
// Entering paste mode (ctrl-D to finish)
val finalDF = df
.select(colExpr)
.withColumn("array_data",explode_outer($"array_data"))
.select("array_data.*")
// Exiting paste mode, now interpreting.
finalDF.show(false)
+--------------+-------------------+-------+---+
|KSCHL |VTEXT |KWERT |id |
+--------------+-------------------+-------+---+
|ZBN |Brut Négocié |3645.00|1 |
|ZBFA |Brut facturé |4455.00|1 |
|ZBTB |Tarif de base |4455.00|1 |
|sample KSCHL03| | |1 |
|ZMAJ |Majorations |0.00 |2 |
|ZFIF |Remises fin d'ordre|0.00 |2 |
|ZBT |Brut Tarif. |222.75 |2 |
|sample KSCHL03| | |2 |
+--------------+-------------------+-------+---+
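For comparison only (not part of either answer above): Spark SQL's stack() function can express the same wide-to-long reshape in a single select. This is a sketch that assumes exactly the three column groups from the question:
// stack(n, ...) turns every group of expressions into one output row.
df.selectExpr(
  "id",
  "stack(3, KSCHL01, VTEXT01, KWERT01, KSCHL02, VTEXT02, KWERT02, KSCHL03, VTEXT03, KWERT03) as (KSCHL, VTEXT, KWERT)"
).show(false)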
Related
I used rdd.collect() to create an Array, and now I want to use this Array[String] to create a DataFrame. My test file is in the following format (separated by a pipe |):
TimeStamp
IdC
Name
FileName
Start-0f-fields
column01
column02
column03
column04
column05
column06
column07
column08
column010
column11
End-of-fields
Start-of-data
G0002B|0|13|IS|LS|Xys|Xyz|12|23|48|
G0002A|0|13|IS|LS|Xys|Xyz|12|23|45|
G0002x|0|13|IS|LS|Xys|Xyz|12|23|48|
G0002C|0|13|IS|LS|Xys|Xyz|12|23|48|
End-of-data
document
The column names are between Start-of-fields and End-of-fields.
I want to store the pipe-separated ("|") data in different columns of the DataFrame,
like the example below:
column01 column02 column03 column04 column05 column06 column07 column08 column010 column11
G0002C 0 13 IS LS Xys Xyz 12 23 48
G0002x 0 13 LS MS Xys Xyz 14 300 400
My code:
val rdd = sc.textFile("the above text file")
val columns = rdd.collect.slice(5,16).mkString(",") // it will hold columnnames
val data = rdd.collect.slice(5,16)
val rdd1 = sc.parallelize(rdd.collect())
val df = rdd1.toDf(columns)
but this is not giving me the desired DataFrame shown above.
Could you try this?
import spark.implicits._ // Add to use `toDS()` and `toDF()`
val rdd = sc.textFile("the above text file")
val columns = rdd.collect.slice(5, 15)     // column names between Start-of-fields and End-of-fields; `.mkString(",")` is not needed
val dataDS = rdd.collect.slice(17, 21)     // data rows between Start-of-data and End-of-data
  .map(_.trim())                           // to remove whitespace
  .map(s => s.substring(0, s.length - 1))  // to remove the trailing pipe '|'
.toSeq
.toDS
val df = spark.read
.option("header", false)
.option("delimiter", "|")
.csv(dataDS)
.toDF(columns: _*)
df.show(false)
+--------+--------+--------+--------+--------+--------+--------+--------+---------+--------+
|column01|column02|column03|column04|column05|column06|column07|column08|column010|column11|
+--------+--------+--------+--------+--------+--------+--------+--------+---------+--------+
|G0002B |0 |13 |IS |LS |Xys |Xyz |12 |23 |48 |
|G0002A |0 |13 |IS |LS |Xys |Xyz |12 |23 |45 |
|G0002x |0 |13 |IS |LS |Xys |Xyz |12 |23 |48 |
|G0002C |0 |13 |IS |LS |Xys |Xyz |12 |23 |48 |
+--------+--------+--------+--------+--------+--------+--------+--------+---------+--------+
Calling the spark.read...csv() method without a schema can take a long time with huge data because of schema inference.
In that case, you can specify the schema like below.
/*
column01 STRING,
column02 STRING,
column03 STRING,
...
*/
val schema = columns
.map(c => s"$c STRING")
.mkString(",\n")
val df = spark.read
.option("header", false)
.option("delimiter", "|")
.schema(schema) // no schema inference occurs
.csv(dataDS)
// .toDF(columns: _*) => unnecessary when schema is specified
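Equivalently (a sketch, not part of the original answer), the same schema can be built programmatically as a StructType instead of a DDL string:
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// One StringType field per collected column name.
val structSchema = StructType(columns.map(c => StructField(c, StringType, nullable = true)))

val df = spark.read
  .option("header", false)
  .option("delimiter", "|")
  .schema(structSchema)
  .csv(dataDS)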
If the number of columns and the column names are fixed, then you can do it as below:
val columns = rdd.collect.slice(5,15).mkString(",") // it will hold the column names
val data = rdd.collect.slice(17,21)                 // it will hold the data rows
val d = data.mkString("\n").split('\n').toSeq.toDF()

import org.apache.spark.sql.functions._

val dd = d
  .withColumn("columnX", split($"value", "\\|"))
  .withColumn("column1", $"columnX".getItem(0))
  .withColumn("column2", $"columnX".getItem(1))
  .withColumn("column3", $"columnX".getItem(2))
  .withColumn("column4", $"columnX".getItem(3))
  .withColumn("column5", $"columnX".getItem(4))
  .withColumn("column6", $"columnX".getItem(5))
  .withColumn("column7", $"columnX".getItem(6))
  .withColumn("column8", $"columnX".getItem(7))
  .withColumn("column10", $"columnX".getItem(8))
  .withColumn("column11", $"columnX".getItem(9))
  .drop("columnX", "value")

display(dd) // `display` renders the DataFrame in a Databricks notebook; use dd.show(false) elsewhere
Assuming the following DataFrame df1:
df1:
+---------+--------+-------+
|A |B |C |
+---------+--------+-------+
|toto |tata |titi |
+---------+--------+-------+
I have an integer N = 3 which I want to use to create 3 copies of the rows of df1 in the df2 DataFrame:
df2 :
+---------+--------+-------+
|A |B |C |
+---------+--------+-------+
|toto |tata |titi |
|toto |tata |titi |
|toto |tata |titi |
+---------+--------+-------+
Any ideas?
From Spark 2.4+, use the arrays_zip + array_repeat + explode functions for this case.
val df = Seq(("toto","tata","titi")).toDF("A","B","C")

df.withColumn("arr", explode(array_repeat(arrays_zip(array("A"), array("B"), array("C")), 3)))
  .drop("arr")
  .show(false)

//or dynamic way
val cols = df.columns.map(x => col(x))
df.withColumn("arr", explode(array_repeat(arrays_zip(array(cols: _*)), 3)))
  .drop("arr")
  .show(false)
//+----+----+----+
//|A |B |C |
//+----+----+----+
//|toto|tata|titi|
//|toto|tata|titi|
//|toto|tata|titi|
//+----+----+----+
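Since the zipped struct is dropped immediately, the same effect can be obtained by repeating any literal. A minimal sketch (Spark 2.4+), not from the original answer:
import org.apache.spark.sql.functions.{array_repeat, explode, lit}

// Explode a 3-element literal array to emit each input row 3 times.
df.withColumn("arr", explode(array_repeat(lit(1), 3)))
  .drop("arr")
  .show(false)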
You can use foldLeft along with DataFrame's union:
import org.apache.spark.sql.DataFrame

object JoinDataFrames {

  def main(args: Array[String]): Unit = {
    val spark = Constant.getSparkSess // author's helper that returns a SparkSession
    import spark.implicits._

    val df = List(("toto","tata","titi")).toDF("A","B","C")
    val N = 3
    // (1 until N) unions the DataFrame with itself N-1 times, giving N copies in total
    val resultDf = (1 until N).foldLeft(df)((dfInner: DataFrame, count: Int) => {
      df.union(dfInner)
    })
    resultDf.show()
  }
}
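A quick way to convince yourself that the fold produces exactly N copies (a small sketch reusing the names from the snippet above):
// The fold runs N-1 unions on top of the original df, so resultDf holds N copies of each row.
assert(resultDf.count() == df.count() * N)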
I have a DataFrame:
| ID | TIMESTAMP | VALUE |
| 1  | 15:00:01  | 3     |
| 1  | 17:04:02  | 2     |
I want to add, with Spark-Scala, a new record before each row whose value is 2, with the same timestamp minus 1 second.
The output would be:
| ID | TIMESTAMP | VALUE |
| 1  | 15:00:01  | 3     |
| 1  | 17:04:01  | 2     |
| 1  | 17:04:02  | 2     |
Thanks
You need a .flatMap()
Similar to map, but each input item can be mapped to 0 or more output items (so func should return a Seq rather than a single item).
import spark.implicits._ // for $-columns, .toDF and Dataset encoders

val data = (spark.createDataset(Seq(
(1, "15:00:01", 3),
(1, "17:04:02", 2)
)).toDF("ID", "TIMESTAMP_STR", "VALUE")
.withColumn("TIMESTAMP", $"TIMESTAMP_STR".cast("timestamp").as("TIMESTAMP"))
.drop("TIMESTAMP_STR")
.select("ID", "TIMESTAMP", "VALUE")
)
data.as[(Long, java.sql.Timestamp, Long)].flatMap(r => {
if(r._3 == 2) {
Seq(
(r._1, new java.sql.Timestamp(r._2.getTime() - 1000L), r._3),
(r._1, r._2, r._3)
)
} else {
Some(r._1, r._2, r._3)
}
}).toDF("ID", "TIMESTAMP", "VALUE").show()
Which results in:
+---+-------------------+-----+
| ID| TIMESTAMP|VALUE|
+---+-------------------+-----+
| 1|2019-03-04 15:00:01| 3|
| 1|2019-03-04 17:04:01| 2|
| 1|2019-03-04 17:04:02| 2|
+---+-------------------+-----+
You can introduce a new array column: when value = 2 then Array(-1, 0), else Array(0). Then explode that column and add it (in seconds) to the timestamp. The below should work for you. Check this out:
scala> val df = Seq((1,"15:00:01",3),(1,"17:04:02",2)).toDF("id","timestamp","value")
df: org.apache.spark.sql.DataFrame = [id: int, timestamp: string ... 1 more field]
scala> val df2 = df.withColumn("timestamp",'timestamp.cast("timestamp"))
df2: org.apache.spark.sql.DataFrame = [id: int, timestamp: timestamp ... 1 more field]
scala> df2.show(false)
+---+-------------------+-----+
|id |timestamp |value|
+---+-------------------+-----+
|1 |2019-03-04 15:00:01|3 |
|1 |2019-03-04 17:04:02|2 |
+---+-------------------+-----+
scala> val df3 = df2.withColumn("newc", when($"value"===lit(2),lit(Array(-1,0))).otherwise(lit(Array(0))))
df3: org.apache.spark.sql.DataFrame = [id: int, timestamp: timestamp ... 2 more fields]
scala> df3.show(false)
+---+-------------------+-----+-------+
|id |timestamp |value|newc |
+---+-------------------+-----+-------+
|1 |2019-03-04 15:00:01|3 |[0] |
|1 |2019-03-04 17:04:02|2 |[-1, 0]|
+---+-------------------+-----+-------+
scala> val df4 = df3.withColumn("c_explode",explode('newc)).withColumn("timestamp2",to_timestamp(unix_timestamp('timestamp)+'c_explode))
df4: org.apache.spark.sql.DataFrame = [id: int, timestamp: timestamp ... 4 more fields]
scala> df4.select($"id",$"timestamp2",$"value").show(false)
+---+-------------------+-----+
|id |timestamp2 |value|
+---+-------------------+-----+
|1 |2019-03-04 15:00:01|3 |
|1 |2019-03-04 17:04:01|2 |
|1 |2019-03-04 17:04:02|2 |
+---+-------------------+-----+
scala>
If you want the time part alone, you can do it like this:
scala> df4.withColumn("timestamp",from_unixtime(unix_timestamp('timestamp2),"HH:mm:ss")).select($"id",$"timestamp",$"value").show(false)
+---+---------+-----+
|id |timestamp|value|
+---+---------+-----+
|1 |15:00:01 |3 |
|1 |17:04:01 |2 |
|1 |17:04:02 |2 |
+---+---------+-----+
In my requirement, I come across a situation where I have to pass 2 strings from 2 columns of my DataFrame, get back the result as a string, and store it back into a DataFrame.
Now, while passing the value as a string, it always returns the same value, so the same value is populated in all rows. (In my case, PPPP is populated in all rows.)
Is there a way to pass the elements of those 2 columns from every row and get the result in separate rows?
I am ready to modify my function to accept a DataFrame and return a DataFrame, OR accept an Array[String] and get back an Array[String], but I don't know how to do that as I am new to programming. Can someone please help me?
Thanks.
def myFunction(key: String , value :String ) : String = {
//Do my functions and get back a string value2 and return this value2 string
value2
}
val DF2 = DF1.select (
DF1("col1")
,DF1("col2")
,DF1("col5") )
.withColumn("anyName", lit(myFunction ( DF1("col3").toString() , DF1("col4").toString() )))
/*
DF1:
+-----+----+--------+----+----+
|col1 |col2|col3    |col4|col5|
+-----+----+--------+----+----+
|Hello|5   |valueAAA|XXX |123 |
|How  |3   |valueCCC|YYY |111 |
|World|5   |valueDDD|ZZZ |222 |
+-----+----+--------+----+----+

DF2:
+-----+----+----+-------+
|col1 |col2|col5|anyName|
+-----+----+----+-------+
|Hello|5   |123 |PPPPP  |
|How  |3   |111 |PPPPP  |
|World|5   |222 |PPPPP  |
+-----+----+----+-------+
*/
After you define the function, you need to register it as a udf(). The udf() function is available in org.apache.spark.sql.functions. Check this out:
scala> val DF1 = Seq(("Hello",5,"valueAAA","XXX",123),
| ("How",3,"valueCCC","YYY",111),
| ("World",5,"valueDDD","ZZZ",222)
| ).toDF("col1","col2","col3","col4","col5")
DF1: org.apache.spark.sql.DataFrame = [col1: string, col2: int ... 3 more fields]
scala> val DF2 = DF1.select ( DF1("col1") ,DF1("col2") ,DF1("col5") )
DF2: org.apache.spark.sql.DataFrame = [col1: string, col2: int ... 1 more field]
scala> DF2.show(false)
+-----+----+----+
|col1 |col2|col5|
+-----+----+----+
|Hello|5 |123 |
|How |3 |111 |
|World|5 |222 |
+-----+----+----+
scala> DF1.select("*").show(false)
+-----+----+--------+----+----+
|col1 |col2|col3 |col4|col5|
+-----+----+--------+----+----+
|Hello|5 |valueAAA|XXX |123 |
|How |3 |valueCCC|YYY |111 |
|World|5 |valueDDD|ZZZ |222 |
+-----+----+--------+----+----+
scala> def myConcat(a:String,b:String):String=
| return a + "--" + b
myConcat: (a: String, b: String)String
scala>
scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._
scala> val myConcatUDF = udf(myConcat(_:String,_:String):String)
myConcatUDF: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function2>,StringType,Some(List(StringType, StringType)))
scala> DF1.select ( DF1("col1") ,DF1("col2") ,DF1("col5"), myConcatUDF( DF1("col3"), DF1("col4"))).show()
+-----+----+----+---------------+
| col1|col2|col5|UDF(col3, col4)|
+-----+----+----+---------------+
|Hello| 5| 123| valueAAA--XXX|
| How| 3| 111| valueCCC--YYY|
|World| 5| 222| valueDDD--ZZZ|
+-----+----+----+---------------+
scala>
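As a follow-up note (a sketch, assuming the same myConcat shown above): if the function should also be callable from SQL strings or selectExpr, it can be registered by name with spark.udf.register:
// Register the Scala function under a SQL-visible name.
spark.udf.register("myConcat", myConcat(_: String, _: String): String)

DF1.selectExpr("col1", "col2", "col5", "myConcat(col3, col4) AS anyName").show(false)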
I have the following data:
+-----+-----+----+
|Col1 |t0 |t1 |
+-----+-----+----+
| A |null |20 |
| A |20 |40 |
| B |null |10 |
| B |10 |20 |
| B |20 |120 |
| B |120 |140 |
| B |140 |320 |
| B |320 |340 |
| B |340 |360 |
+-----+-----+----+
And what I want is something like this:
+-----+-----+----+----+
|Col1 |t0 |t1 |grp |
+-----+-----+----+----+
| A |null |20 |1A |
| A |20 |40 |1A |
| B |null |10 |1B |
| B |10 |20 |1B |
| B |20 |120 |2B |
| B |120 |140 |2B |
| B |140 |320 |3B |
| B |320 |340 |3B |
| B |340 |360 |3B |
+-----+-----+----+----+
Explanation:
The extra column is based on Col1 and the difference between t1 and t0.
When the difference between the two is too high, a new number is generated (in the dataset above, when the difference is greater than 50).
I build t0 with:
val windowSpec = Window.partitionBy($"Col1").orderBy("t1")
df = df.withColumn("t0", lag("t1", 1) over windowSpec)
Can someone help me with how to do this?
I searched but didn't find a good approach.
I'm a little bit lost because I need the value of grp from the previously calculated row...
Thanks
I solved it myself:
val grp = (coalesce(
($"t" - lag($"t", 1).over(windowSpec)),
lit(0)
) > 50).cast("bigint")
df = df.withColumn("grp", sum(grp).over(windowSpec))
With this I don't need both columns (t0 and t1) anymore; I can use only t1 (or t) without computing t0.
(I still need to append the value of Col1, but the most important part, the numbering, is done and works fine.)
I got the solution from:
Spark SQL window function with complex condition
thanks for your help
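For completeness, a small sketch of the remaining step mentioned above (appending Col1 to the running group number so it reads 1A, 2B, ... like the expected output). It reuses the windowSpec from the question and works directly on t1, so it skips t0 as the answer suggests:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{coalesce, concat, lag, lit, sum}

val windowSpec = Window.partitionBy($"Col1").orderBy("t1")

// 1 when the gap to the previous t1 exceeds 50, else 0.
val newGroupFlag = (coalesce($"t1" - lag($"t1", 1).over(windowSpec), lit(0)) > 50).cast("bigint")

// The running sum of the flag is 0-based, so add 1 before concatenating with Col1.
df = df.withColumn("grp", concat((sum(newGroupFlag).over(windowSpec) + 1).cast("string"), $"Col1"))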
You can use a udf function to generate the grp column:
def testUdf = udf((col1: String, t0: Int, t1: Int)=> (t1-t0) match {
case x : Int if(x > 50) => 2+col1
case _ => 1+col1
})
Call the udf function as
df.withColumn("grp", testUdf($"Col1", $"t0", $"t1"))
The udf function above won't work properly due to the null values in t0, which can be replaced by 0:
df.na.fill(0)
I hope this is the answer you are searching for.
Edited
Here's a complete solution using a UDAF. The process is complex. You've already got an easier answer, but it might help somebody who needs it.
First, define the UDAF:
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class Boendal extends UserDefinedAggregateFunction {
def inputSchema = new StructType().add("Col1", StringType).add("t0", IntegerType).add("t1", IntegerType).add("rank", IntegerType)
def bufferSchema = new StructType().add("buff", StringType).add("buffer1", IntegerType)
def dataType = StringType
def deterministic = true
def initialize(buffer: MutableAggregationBuffer) = {
buffer.update(0, "")
buffer.update(1, 0)
}
def update(buffer: MutableAggregationBuffer, input: Row) = {
if (!input.isNullAt(0)) {
val buff = buffer.getString(0)
val col1 = input.getString(0)
val t0 = input.getInt(1)
val t1 = input.getInt(2)
val rank = input.getInt(3)
var value = 1
if((t1-t0) < 50)
value = 1
else
value = (t1-t0)/50
val lastValue = buffer(1).asInstanceOf[Integer]
// if(!buff.isEmpty) {
if (value < lastValue)
value = lastValue
// }
buffer.update(1, value)
var finalString = ""
if(buff.isEmpty){
finalString = rank+";"+value+col1
}
else
finalString = buff+"::"+rank+";"+value+col1
buffer.update(0, finalString)
}
}
def merge(buffer1: MutableAggregationBuffer, buffer2: Row) = {
val buff1 = buffer1.getString(0)
val buff2 = buffer2.getString(0)
buffer1.update(0, buff1+buff2)
}
def evaluate(buffer: Row) : String = {
buffer.getString(0)
}
}
Then some udfs:
def rankUdf = udf((grp: String)=> grp.split(";")(0))
def removeRankUdf = udf((grp: String) => grp.split(";")(1))
And finally, call the UDAF and the udfs:
val windowSpec = Window.partitionBy($"Col1").orderBy($"t1")
df = df.withColumn("t0", lag("t1", 1) over windowSpec)
.withColumn("rank", rank() over windowSpec)
df = df.na.fill(0)
val boendal = new Boendal
val df2 = df.groupBy("Col1").agg(boendal($"Col1", $"t0", $"t1", $"rank").as("grp2")).withColumnRenamed("Col1", "Col2")
.withColumn("grp2", explode(split($"grp2", "::")))
.withColumn("rank2", rankUdf($"grp2"))
.withColumn("grp2", removeRankUdf($"grp2"))
df = df.join(df2, df("Col1") === df2("Col2") && df("rank") === df2("rank2"))
.drop("Col2", "rank", "rank2")
df.show(false)
Hope it helps