How to find rows in df2 which are not present in df1? - pyspark

How can I find the complement of a DataFrame with respect to another DataFrame?
In pandas it can be done by the following code:
df = df1.merge(df2, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
Example:
+---------+----+
| City|Temp|
+---------+----+
| New York| 59|
| Chicago| 29|
| Tokyo| 73|
| Paris| 56|
|New Delhi| 48|
+---------+----+
+---------+----+
| City|Temp|
+---------+----+
| London| 55|
| New York| 55|
| Tokyo| 73|
|New Delhi| 85|
| Paris| 56|
+---------+----+
Result:
+---------+----+----------+
| City|Temp|_merge |
+---------+----+----------+
| London| 55|right_only|
|New Delhi| 85|right_only|
| New York| 55|right_only|
+---------+----+----------+
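For reference, a minimal self-contained version of that pandas snippet on the example data above (a sketch, assuming pandas is imported as pd):
import pandas as pd

# Build the two example frames, then apply the merge/indicator pattern from the question.
df1 = pd.DataFrame({'City': ['New York', 'Chicago', 'Tokyo', 'Paris', 'New Delhi'],
                    'Temp': [59, 29, 73, 56, 48]})
df2 = pd.DataFrame({'City': ['London', 'New York', 'Tokyo', 'New Delhi', 'Paris'],
                    'Temp': [55, 55, 73, 85, 56]})
# Outer merge with an indicator column, keeping only the rows that exist in df2 alone.
df = (df1.merge(df2, how='outer', indicator=True)
         .loc[lambda x: x['_merge'] == 'right_only'])
print(df)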

You can use subtract.
df = df2.subtract(df1)
Result
+---------+----+
| City|Temp|
+---------+----+
| New York| 55|
| London| 55|
|New Delhi| 85|
+---------+----+
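Note that subtract has EXCEPT DISTINCT semantics, so duplicate rows in df2 are collapsed in the result. If duplicates should be preserved, exceptAll is a possible alternative; a minimal sketch using the same df1 and df2:
# subtract drops duplicate rows (EXCEPT DISTINCT); exceptAll keeps them (EXCEPT ALL).
df_no_dupes = df2.subtract(df1)
df_keep_dupes = df2.exceptAll(df1)
df_keep_dupes.show()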

Another option is an outer join on the key columns, keeping only the rows where a column that exists only in df1 (id1 below) is null:
df1.join(df2, ['City', 'Temp'], 'outer').filter(" id1 IS NULL ")
dt1 = [
(0, 'New York', 59),
(1, 'Chicago', 29),
(2, 'Tokyo', 73),
(3, 'Paris', 56),
(4, 'New Delhi', 48),
]
df1 = spark.createDataFrame(dt1, ['id1','City', 'Temp'])
dt2 = [
(0, 'London', 55),
(1, 'New York', 55),
(2, 'Tokyo', 73),
(3, 'New Delhi', 85),
(4, 'Paris', 56),
]
df2 = spark.createDataFrame(dt2, ['id2','City', 'Temp'])
(
df1.join(df2, ['City', 'Temp'], 'outer')
.filter(" id1 IS NULL ")
.sort('id2')
.show(10, False)
)
# +---------+----+----+---+
# |City |Temp|id1 |id2|
# +---------+----+----+---+
# |London |55 |null|0 |
# |New York |55 |null|1 |
# |New Delhi|85 |null|3 |
# +---------+----+----+---+
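If the real df1 has no id column to test for null, a variant of the same idea (a sketch only; the marker column name in_df1 is made up here) is to tag df1 with a literal before a left join:
from pyspark.sql import functions as F

# Tag every df1 row, left-join onto df2, and keep the df2 rows whose marker stayed null.
marked_df1 = df1.select('City', 'Temp').withColumn('in_df1', F.lit(1))
(
    df2.join(marked_df1, ['City', 'Temp'], 'left')
       .filter(F.col('in_df1').isNull())
       .select('City', 'Temp')
       .show()
)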

You can also try a "left_anti" join, which keeps only the rows of df2 that have no match in df1 on the join columns. The code would look like this:
df = (
df2
.join(df1, ['City', 'Temp'], 'left_anti')
)
output:
+---------+----+
| City|Temp|
+---------+----+
| London| 55|
|New Delhi| 85|
| New York| 55|
+---------+----+
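The same anti join can also be expressed in Spark SQL; a minimal sketch, assuming the two DataFrames are registered as temp views (the view names t1 and t2 are made up here):
df1.createOrReplaceTempView("t1")
df2.createOrReplaceTempView("t2")

# LEFT ANTI JOIN keeps only the t2 rows with no matching (City, Temp) in t1.
spark.sql("""
    SELECT t2.City, t2.Temp
    FROM t2
    LEFT ANTI JOIN t1
      ON t1.City = t2.City AND t1.Temp = t2.Temp
""").show()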

Related

pyspark: Merge multiple columns into one column but save the original column name

I have a dataframe with many columns such as v1, v2, v3, and so on; only v1 and v2 are shown here. In each row, only one of the columns with the prefix v has a real number value, and all the other v columns are null.
I want to merge the columns starting with v into one column and create a corresponding column cols that shows which original column the value came from. An example of the original table and the resulting table is shown below.
Note: the original table has about 200 columns, from v1 to v200, and over a million rows.
original table
+---------+-----+----+-----+
| org| v1| v2|count|
+---------+-----+----+-----+
| Sh| 46|null| 2|
| Sh| 41|null| 1|
| Sh| null| 4| 3|
| Fi| 30|null| 6|
| Fi| null| 4| 2|
| Xf| null| 2| 1|
| Ai| 27|null| 1|
+---------+-----+----+-----+
result table
+---------+-----+-----+-----+
| org| val| cols|count|
+---------+-----+-----+-----+
| Sh| 46| v1| 2|
| Sh| 41| v1| 1|
| Sh| 4| v2| 3|
| Fi| 30| v1| 6|
| Fi| 4| v2| 2|
| Xf| 2| v2| 1|
| Ai| 27| v1| 1|
+---------+-----+-----+-----+
Sample dataframe:
sample_data = (\
("Sh", 46, None, 2), \
("Sh", 46, None, 1), \
("Sh", None, 4, 3), \
("Fi", 30, None, 6), \
("Fi", None, 4, 2), \
("Xf", None, 2, 1), \
("Ai", 27, None, 1), \
)
columns= [ "org", "v1", "v2", "count"]
df = spark.createDataFrame(data = sample_data, schema = columns)
try this:
import pyspark.sql.functions as f
sample_data = (\
("Sh", 46, None, 2), \
("Sh", 46, None, 1), \
("Sh", None, 4, 3), \
("Fi", 30, None, 6), \
("Fi", None, 4, 2), \
("Xf", None, 2, 1), \
("Ai", 27, None, 1), \
)
columns= [ "org", "v1", "v2", "count"]
columns_of_interest_count = 2
df = (
    spark.createDataFrame(data=sample_data, schema=columns)
    .withColumn(
        'mapped_columns',
        f.map_filter(
            f.map_from_arrays(
                f.array([f.lit(f'v{i}') for i in range(1, columns_of_interest_count + 1)]),
                f.array([f.col(f'v{i}') for i in range(1, columns_of_interest_count + 1)]),
            ),
            lambda _, value: ~f.isnull(value),
        ),
    )
    .select('org', 'count', f.explode(f.col('mapped_columns')).alias('col', 'val'))
)
df.show(truncate= False)
output:
+---+-----+---+---+
|org|count|col|val|
+---+-----+---+---+
|Sh |2 |v1 |46 |
|Sh |1 |v1 |46 |
|Sh |3 |v2 |4 |
|Fi |6 |v1 |30 |
|Fi |2 |v2 |4 |
|Xf |1 |v2 |2 |
|Ai |1 |v1 |27 |
+---+-----+---+---+
You can create an array of structs and filter that array to keep only the structs whose values are non-null.
import pyspark.sql.functions as func

data_sdf. \
withColumn('v_structs',
func.array(*[func.struct(func.lit(k).alias('cols'), func.col(k).alias('vals'))
for k in data_sdf.columns if k[0].lower() == 'v']
)
). \
withColumn('v_having_value_structs',
func.expr('filter(v_structs, x -> x.vals is not null)')
). \
select(*data_sdf.columns, func.expr('inline(v_having_value_structs)')). \
show(truncate=False)
# +---+----+----+-----+----+----+
# |org|v1 |v2 |count|cols|vals|
# +---+----+----+-----+----+----+
# |Sh |46 |null|2 |v1 |46 |
# |Sh |46 |null|1 |v1 |46 |
# |Sh |null|4 |3 |v2 |4 |
# |Fi |30 |null|6 |v1 |30 |
# |Fi |null|4 |2 |v2 |4 |
# |Xf |null|2 |1 |v2 |2 |
# |Ai |27 |null|1 |v1 |27 |
# +---+----+----+-----+----+----+
The array of structs would look like this
data_sdf. \
withColumn('v_structs',
func.array(*[func.struct(func.lit(k).alias('cols'), func.col(k).alias('vals'))
for k in data_sdf.columns if k[0].lower() == 'v']
)
). \
withColumn('v_having_value_structs', func.expr('filter(v_structs, x -> x.vals is not null)')). \
show(truncate=False)
# +---+----+----+-----+----------------------+----------------------+
# |org|v1 |v2 |count|v_structs |v_having_value_structs|
# +---+----+----+-----+----------------------+----------------------+
# |Sh |46 |null|2 |[{v1, 46}, {v2, null}]|[{v1, 46}] |
# |Sh |46 |null|1 |[{v1, 46}, {v2, null}]|[{v1, 46}] |
# |Sh |null|4 |3 |[{v1, null}, {v2, 4}] |[{v2, 4}] |
# |Fi |30 |null|6 |[{v1, 30}, {v2, null}]|[{v1, 30}] |
# |Fi |null|4 |2 |[{v1, null}, {v2, 4}] |[{v2, 4}] |
# |Xf |null|2 |1 |[{v1, null}, {v2, 2}] |[{v2, 2}] |
# |Ai |27 |null|1 |[{v1, 27}, {v2, null}]|[{v1, 27}] |
# +---+----+----+-----+----------------------+----------------------+
Yet another approach.
from functools import reduce
from pyspark.sql import functions as F
vcols = [x for x in df.columns if x.startswith('v')]
df = (df.select('*', F.coalesce(*vcols).alias('val'))
.select('org', 'val', 'count', reduce(lambda p, c: p.when(F.col(c) == F.col('val'), F.lit(c)), vcols, F).alias('cols'))
)
First, use coalesce to take the first non-null value among the columns of interest.
Then use a chain of when to find which column has the same value as the one obtained from coalesce.
This part
reduce(lambda p, c: p.when(F.col(c) == F.col('val'), F.lit(c)), vcols, F)
will generate
(F.when(F.col('v1') == F.col('val'), F.lit('v1'))
.when(F.col('v2') == F.col('val'), F.lit('v2'))
...
)
So this gives the column name that the non-null value came from.
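Not used in the answers above, but for comparison: Spark's stack function is another common way to unpivot a wide table like this, and the expression can be generated from the column list, so it also scales to the 200-column case. A sketch, assuming the question's original sample df (org, v1, v2, count), before any of the transformations above:
import pyspark.sql.functions as F

vcols = [c for c in df.columns if c.startswith('v')]
# Build "stack(n, 'v1', v1, 'v2', v2, ...)" from the v-columns, then drop the null rows.
stack_expr = "stack({}, {}) as (cols, val)".format(
    len(vcols), ", ".join("'{0}', {0}".format(c) for c in vcols)
)
result = (
    df.select('org', 'count', F.expr(stack_expr))
      .filter(F.col('val').isNotNull())
)
result.show()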

Pyspark - Fill empty strings with '0' if the data type is BIGINT/DOUBLE/Integer

I am trying to fill empty strings with '0' if the column data type is BIGINT/DOUBLE/Integer in a dataframe using pyspark.
data = [("James","","Smith","36","M",3000,"1.2"),
("Michael","Rose"," ","40","M",4000,"2.0"),
("Robert","","Williams","42","M",4000,"5.0"),
("Maria","Anne"," ","39","F", ," "),
("Jen","Mary","Brown"," ","F",-1,"")
]
schema = StructType([
StructField("firstname",StringType(),True),
StructField("middlename",StringType(),True),
StructField("lastname",StringType(),True),
StructField("age", StringType(), True),
StructField("gender", StringType(), True),
StructField("salary", IntegerType(), True),
StructField("amount", DoubleType(), True)
])
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
I am trying it like this:
df.select( *[ F.when(F.dtype in ('integertype','doubletype') and F.col(column).ishaving(" "),'0').otherwise(F.col(column)).alias(column) for column in df.columns]).show()
Expected output:
+---------+----------+--------+---+------+------+------+
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
| James| | Smith| 36| M| 3000| 1.2|
| Michael| Rose| | 40| M| 4000| 2.0|
| Robert| |Williams| 42| M| 4000| 5.0|
| Maria| Anne| | 39| F| 0| 0|
| Jen| Mary| Brown| | F| -1| 0|
+---------+----------+--------+---+------+------+------+
You can utilise reduce to accomplish this; it makes the code cleaner and easier to understand.
Additionally, create a to_fill list that selects the columns matching your condition; it can be adjusted for other scenarios.
Data Preparation
from functools import reduce
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

data = [("James","","Smith","36","M",3000,1.2),
("Michael","Rose"," ","40","M",4000,2.0),
("Robert","","Williams","42","M",4000,5.0),
("Maria","Anne"," ","39","F",None,None),
("Jen","Mary","Brown"," ","F",-1,None)
]
schema = StructType([
StructField("firstname",StringType(),True),
StructField("middlename",StringType(),True),
StructField("lastname",StringType(),True),
StructField("age", StringType(), True),
StructField("gender", StringType(), True),
StructField("salary", IntegerType(), True),
StructField("amount", DoubleType(), True)
])
sparkDF = spark.createDataFrame(data=data,schema=schema)
sparkDF.show()
+---------+----------+--------+---+------+------+------+
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
| James| | Smith| 36| M| 3000| 1.2|
| Michael| Rose| | 40| M| 4000| 2.0|
| Robert| |Williams| 42| M| 4000| 5.0|
| Maria| Anne| | 39| F| null| null|
| Jen| Mary| Brown| | F| -1| null|
+---------+----------+--------+---+------+------+------+
Reduce
to_fill = [ c for c,d in sparkDF.dtypes if d in ['int','bigint','double']]
# to_fill --> ['salary','amount']
sparkDF = reduce(
lambda df, x: df.withColumn(x, F.when(F.col(x).isNull(),0).otherwise(F.col(x))),
to_fill,
sparkDF,
)
sparkDF.show()
+---------+----------+--------+---+------+------+------+
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
| James| | Smith| 36| M| 3000| 1.2|
| Michael| Rose| | 40| M| 4000| 2.0|
| Robert| |Williams| 42| M| 4000| 5.0|
| Maria| Anne| | 39| F| 0| 0.0|
| Jen| Mary| Brown| | F| -1| 0.0|
+---------+----------+--------+---+------+------+------+
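For this particular null-to-zero case, DataFrame.fillna with a subset is an equivalent, more compact option:
# Equivalent shortcut: fill nulls with 0 only in the numeric columns collected in to_fill.
sparkDF = sparkDF.fillna(0, subset=to_fill)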
You can try this:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
spark = SparkSession.builder.master("local").appName("test").getOrCreate()
data = [("James", "", "Smith", "36", "", 3000, 1.2),
("Michael", "Rose", "", "40", "M", 4000, 2.0),
("Robert", "", "Williams", "42", "M", 4000, 5.0),
("Maria", "Anne", " ", "39", "F", None, None),
("Jen", "Mary", "Brown", " ", "F", -1, None)
]
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("age", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("amount", DoubleType(), True)
])
dfa = spark.createDataFrame(data=data, schema=schema)
dfa.show()
def removenull(dfa):
    dfa = dfa.select([trim(col(c)).alias(c) for c in dfa.columns])
    for i in dfa.columns:
        dfa = dfa.withColumn(i, when(col(i) == "", None).otherwise(col(i)))
    return dfa

removenull(dfa).show()
output:
+---------+----------+--------+----+------+------+------+
|firstname|middlename|lastname| age|gender|salary|amount|
+---------+----------+--------+----+------+------+------+
| James| null| Smith| 36| null| 3000| 1.2|
| Michael| Rose| null| 40| M| 4000| 2.0|
| Robert| null|Williams| 42| M| 4000| 5.0|
| Maria| Anne| null| 39| F| null| null|
| Jen| Mary| Brown|null| F| -1| null|
+---------+----------+--------+----+------+------+------+

How to add row index on a group of 2 ordered columns such that the row index restarts for each group, for a pyspark dataframe?

I have a PySpark dataframe-
df1 = spark.createDataFrame([
("u1", 10),
("u1", 20),
("u2", 10),
("u2", 10),
("u2", 30),
],
['user_id', 'var1'])
print(df1.printSchema())
df1.show(truncate=False)
It looks like-
root
|-- user_id: string (nullable = true)
|-- var1: long (nullable = true)
None
+-------+----+
|user_id|var1|
+-------+----+
|u1 |10 |
|u1 |20 |
|u2 |10 |
|u2 |10 |
|u2 |30 |
+-------+----+
I want to assign a row index such that the indexing restarts for each group, with user_id sorted in ascending order and var1 sorted in descending order.
The desired output should look like-
+-------+----+-----+
|user_id|var1|order|
+-------+----+-----+
|u1 |10 | 1|
|u1 |20 | 2|
|u2 |10 | 1|
|u2 |10 | 2|
|u2 |30 | 3|
+-------+----+-----+
How do I achieve this?
It's just a row number operation:
from pyspark.sql import functions as F, Window
df2 = df1.withColumn(
'order',
F.row_number().over(Window.partitionBy('user_id').orderBy('var1'))
)
df2.show()
+-------+----+-----+
|user_id|var1|order|
+-------+----+-----+
| u1| 10| 1|
| u1| 20| 2|
| u2| 10| 1|
| u2| 10| 2|
| u2| 30| 3|
+-------+----+-----+
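If var1 should instead be ranked in descending order within each group, as the question's wording (though not its sample output) suggests, reverse the window ordering:
# Same row_number pattern, but ordering var1 from largest to smallest within each user_id.
df2_desc = df1.withColumn(
    'order',
    F.row_number().over(Window.partitionBy('user_id').orderBy(F.col('var1').desc()))
)
df2_desc.show()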

Finding Percentile in Spark-Scala per a group

I am trying to compute a percentile over a column using a window function, as shown below. I have referred to an existing answer to use the ApproximatePercentile definition over a group.
val df1 = Seq(
(1, 10.0), (1, 20.0), (1, 40.6), (1, 15.6), (1, 17.6), (1, 25.6),
(1, 39.6), (2, 20.5), (2 ,70.3), (2, 69.4), (2, 74.4), (2, 45.4),
(3, 60.6), (3, 80.6), (4, 30.6), (4, 90.6)
).toDF("ID","Count")
val idBucketMapping = Seq((1, 4), (2, 3), (3, 2), (4, 2))
.toDF("ID", "Bucket")
//jpp
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile
import org.apache.spark.sql.expressions.Window
object PercentileApprox {
def percentile_approx(col: Column, percentage: Column,
accuracy: Column): Column = {
val expr = new ApproximatePercentile(
col.expr, percentage.expr, accuracy.expr
).toAggregateExpression
new Column(expr)
}
def percentile_approx(col: Column, percentage: Column): Column =
percentile_approx(col, percentage,
lit(ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY))
}
import PercentileApprox._
var res = df1
.withColumn("percentile",
percentile_approx(col("count"), typedLit(doBucketing(2)))
.over(Window.partitionBy("ID"))
)
def doBucketing(bucket_size : Int) = (1 until bucket_size)
.scanLeft(0d)((a, _) => a + (1 / bucket_size.toDouble))
scala> df1.show
+---+-----+
| ID|Count|
+---+-----+
| 1| 10.0|
| 1| 20.0|
| 1| 40.6|
| 1| 15.6|
| 1| 17.6|
| 1| 25.6|
| 1| 39.6|
| 2| 20.5|
| 2| 70.3|
| 2| 69.4|
| 2| 74.4|
| 2| 45.4|
| 3| 60.6|
| 3| 80.6|
| 4| 30.6|
| 4| 90.6|
+---+-----+
scala> idBucketMapping.show
+---+------+
| ID|Bucket|
+---+------+
| 1| 4|
| 2| 3|
| 3| 2|
| 4| 2|
+---+------+
scala> res.show
+---+-----+------------------+
| ID|Count| percentile|
+---+-----+------------------+
| 1| 10.0|[10.0, 20.0, 40.6]|
| 1| 20.0|[10.0, 20.0, 40.6]|
| 1| 40.6|[10.0, 20.0, 40.6]|
| 1| 15.6|[10.0, 20.0, 40.6]|
| 1| 17.6|[10.0, 20.0, 40.6]|
| 1| 25.6|[10.0, 20.0, 40.6]|
| 1| 39.6|[10.0, 20.0, 40.6]|
| 3| 60.6|[60.6, 60.6, 80.6]|
| 3| 80.6|[60.6, 60.6, 80.6]|
| 4| 30.6|[30.6, 30.6, 90.6]|
| 4| 90.6|[30.6, 30.6, 90.6]|
| 2| 20.5|[20.5, 69.4, 74.4]|
| 2| 70.3|[20.5, 69.4, 74.4]|
| 2| 69.4|[20.5, 69.4, 74.4]|
| 2| 74.4|[20.5, 69.4, 74.4]|
| 2| 45.4|[20.5, 69.4, 74.4]|
+---+-----+------------------+
Up to here all is well and the logic is simple. But I need the results computed dynamically: the argument to doBucketing (2 in the call above) should be taken from idBucketMapping based on the ID value.
This seems a little bit tricky to me. Is this possible by any means?
Expected output (the percentile buckets are driven by the idBucketMapping DataFrame):
+---+-----+------------------------+
|ID |Count|percentile |
+---+-----+------------------------+
|1 |10.0 |[10.0, 15.6, 20.0, 39.6]|
|1 |20.0 |[10.0, 15.6, 20.0, 39.6]|
|1 |40.6 |[10.0, 15.6, 20.0, 39.6]|
|1 |15.6 |[10.0, 15.6, 20.0, 39.6]|
|1 |17.6 |[10.0, 15.6, 20.0, 39.6]|
|1 |25.6 |[10.0, 15.6, 20.0, 39.6]|
|1 |39.6 |[10.0, 15.6, 20.0, 39.6]|
|3 |60.6 |[60.6, 60.6] |
|3 |80.6 |[60.6, 60.6] |
|4 |30.6 |[30.6, 30.6] |
|4 |90.6 |[30.6, 30.6] |
|2 |20.5 |[20.5, 45.4, 70.3] |
|2 |70.3 |[20.5, 45.4, 70.3] |
|2 |69.4 |[20.5, 45.4, 70.3] |
|2 |74.4 |[20.5, 45.4, 70.3] |
|2 |45.4 |[20.5, 45.4, 70.3] |
+---+-----+------------------------+
I have a solution for you that is rather inelegant and works only if you have a limited number of possible bucket sizes.
My first version is very ugly.
// for the sake of clarity, let's define a function that generates the
// window aggregation
def per(x : Int) = percentile_approx(col("count"), typedLit(doBucketing(x)))
.over(Window.partitionBy("ID"))
// then, we simply try to match the Bucket column with a possible value
val res = df1
  .join(idBucketMapping, Seq("ID"))
  .withColumn("percentile", when('Bucket === 2, per(2))
    .otherwise(when('Bucket === 3, per(3))
    .otherwise(per(4)))
  )
That's nasty but it works in your case.
Slightly less ugly but with the very same logic: you can define a set of possible bucket counts and use it to do the same thing as above.
val possible_number_of_buckets = 2 to 5
val res = df1
.join(idBucketMapping, Seq("ID"))
.withColumn("percentile", possible_number_of_buckets
.tail
.foldLeft(per(possible_number_of_buckets.head))
((column, size) => when('Bucket === size, per(size))
.otherwise(column)))
percentile_approx takes a percentage and an accuracy. It seems they both must be constant literals, so percentile_approx cannot be computed at runtime with a dynamically calculated percentage or accuracy.
Reference: the Apache Spark percentile_approx (ApproximatePercentile) source.

Add new column to dataframe based on previous values and condition

I have a sample dataframe. After grouping by level1 and date I got the resulting dataframe:
val group_df = qwe.groupBy($"level1",$"date").agg(sum("rel_amount").as("amount"))
+------+----------+------+
|level1| date|amount|
+------+----------+------+
| A|2016-03-31| 100|
| A|2016-02-28| 100|
| A|2016-01-31| 400|
| A|2015-12-31| 500|
| A|2015-11-30| 1200|
| A|2015-10-31| 1300|
| A|2014-12-31| 600|
| B|2016-03-31| 10|
| B|2016-02-28| 300|
| B|2016-01-31| 423|
| B|2015-12-31| 501|
| B|2015-11-30| 234|
| B|2015-10-31| 1234|
| B|2014-12-31| 3456|
+------+----------+------+
Now I want to add an extra column (Previous) holding the previous year-end amount for each group.
For example, for level1 = A and date = 2016-03-31 the value should be 500, because that is the amount for 2015-12-31.
Similarly, for date = 2015-12-31 the value should be 600, the amount for 2014-12-31. The previous year-end amount needs to be calculated for each row.
Expected output :
+------+----------+------+--------+
|level1| date|amount|Previous|
+------+----------+------+--------+
| A|2016-03-31| 100| 500|
| A|2016-02-28| 100| 500|
| A|2016-01-31| 400| 500|
| A|2015-12-31| 500| 600|
| A|2015-11-30| 1200| 600|
| A|2015-10-31| 1300| 600|
| A|2014-12-31| 600| 600|
| B|2016-03-31| 10| 501|
| B|2016-02-28| 300| 501|
| B|2016-01-31| 423| 501|
| B|2015-12-31| 501| 3456|
| B|2015-11-30| 234| 3456|
| B|2015-10-31| 1234| 3456|
| B|2014-12-31| 3456| 3456|
+------+----------+------+--------+
Can someone help me with this?
One approach would be to use a UDF that manipulates the date column as a String to create a new column holding the previous end-of-year date:
val df = Seq(
("A", "2016-03-31", 100),
("A", "2016-02-28", 100),
("A", "2016-01-31", 400),
("A", "2015-12-31", 500),
("A", "2015-11-30", 1200),
("A", "2015-10-31", 1300),
("A", "2014-12-31", 600),
("B", "2016-03-31", 10),
("B", "2016-02-28", 300),
("B", "2016-01-31", 423),
("B", "2015-12-31", 501),
("B", "2015-11-30", 234),
("B", "2015-10-31", 1234),
("B", "2014-12-31", 3456)
).toDF(
"level1", "date", "amount"
)
import org.apache.spark.sql.functions._
def previousEOY = udf( (d: String) => (d.substring(0, 4).toInt - 1).toString + "-12-31" )
val df2 = df.withColumn("previous_eoy", previousEOY($"date"))
To take advantage of standard SQL's scalar-subquery capability, I'm switching to a Spark temp view (note that max() is used in the subquery simply to guarantee a single-row return):
df2.createOrReplaceTempView("dfView")
val df3 = spark.sqlContext.sql("""
SELECT
level1, date, amount, (
SELECT max(amount) FROM dfView v2
WHERE v2.level1 = v1.level1 AND v2.date = v1.previous_eoy
) previous
FROM
dfView v1
""")
df3.show
+------+----------+------+--------+
|level1| date|amount|previous|
+------+----------+------+--------+
| A|2016-03-31| 100| 500|
| A|2016-02-28| 100| 500|
| A|2016-01-31| 400| 500|
| A|2015-12-31| 500| 600|
| A|2015-11-30| 1200| 600|
| A|2015-10-31| 1300| 600|
| A|2014-12-31| 600| null|
| B|2016-03-31| 10| 501|
| B|2016-02-28| 300| 501|
| B|2016-01-31| 423| 501|
| B|2015-12-31| 501| 3456|
| B|2015-11-30| 234| 3456|
| B|2015-10-31| 1234| 3456|
| B|2014-12-31| 3456| null|
+------+----------+------+--------+
val amount = ss.sparkContext.parallelize(Seq(("B","2014-12-31", 3456))).toDF("level1", "dateY", "amount")
val yearStr = udf((date:String) => {(date.substring(0,4).toInt - 1) +"-12-31" })
val df3 = amount.withColumn( "p", yearStr($"dateY"))
df3.show()
df3.createOrReplaceTempView("dfView")
val df4 = df3.filter( s => s.getString(1).contains("12-31")).select( $"dateY".as("p"), $"level1",$"amount".as("am"))
df4.show
df3.join( df4, Seq("p", "level1"), "left_outer").orderBy("level1", "amount").drop($"p").show()
First, create a dataframe mapping each level1 and year-end date to its year-end amount. Then join it back to the original dataframe where the previous year-end date matches.