Pyspark counter based on column condition

Hello, I would like to create a new column with a counter based on a condition on the Tag1 column.
I have this:
Time  Tag1
1     0
2     1
3     1
4     1
5     0
6     0
7     1
8     1
9     1
10    1
11    0
12    0
And I would like this:
Time  Tag1  Counter
1     0     0
2     1     1
3     1     2
4     1     3
5     0     0
6     0     0
7     1     1
8     1     2
9     1     3
10    1     4
11    0     0
12    0     0
I tried function.when(df.Tag1 == 1, function.lag(df.Tag1) + 1).otherwise(0), but it doesn't work.
Any idea?
Thanks a lot.

Window function
import sys
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when
from pyspark.sql import Window

new = (df.withColumn('Counter', (col('Tag1') == 0))  # create bool marking rows where Tag1 is 0
         .withColumn('Counter', F.sum(F.col('Counter').cast('integer')).over(Window.partitionBy().orderBy('Time').rowsBetween(-sys.maxsize, 0)))  # create groups by cumulatively summing the bool
         .withColumn('Counter', when(col('Tag1') == 0, col('Tag1')).otherwise(F.sum('Tag1').over(Window.partitionBy('Counter').orderBy('Time').rowsBetween(-sys.maxsize, 0))))  # conditionally add within each group
      )
new.show()
+----+----+-------+
|Time|Tag1|Counter|
+----+----+-------+
| 1| 0| 0|
| 2| 1| 1|
| 3| 1| 2|
| 4| 1| 3|
| 5| 0| 0|
| 6| 0| 0|
| 7| 1| 1|
| 8| 1| 2|
| 9| 1| 3|
| 10| 1| 4|
| 11| 0| 0|
| 12| 0| 0|
+----+----+-------+
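As a side note, the same grouping idea can be written without sys.maxsize by using Window.unboundedPreceding; this is only a minimal sketch, assuming the Time column defines the row order:

import pyspark.sql.functions as F
from pyspark.sql import Window

# cumulative window from the first row up to the current one, ordered by Time
w = Window.orderBy('Time').rowsBetween(Window.unboundedPreceding, Window.currentRow)

# the group id increases every time Tag1 drops back to 0
grouped = df.withColumn('grp', F.sum((F.col('Tag1') == 0).cast('integer')).over(w))

# cumulative sum of Tag1 within each group; rows with Tag1 == 0 stay at 0
wg = Window.partitionBy('grp').orderBy('Time').rowsBetween(Window.unboundedPreceding, Window.currentRow)
result = grouped.withColumn('Counter', F.sum('Tag1').over(wg)).drop('grp')
result.show()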

Related

How to update a column conditionally in pyspark

I have a pyspark dataframe df :-
status      Flag
present     1
present     0
na          1
Void        0
present     1
notpresent  0
present     0
present     0
ok          1
I want to update the Flag to 1 wherever the status is present or ok:
Expected:
status      Flag
present     1
present     1
na          1
Void        0
present     1
notpresent  0
present     1
present     1
ok          1
You can do so using withColumn and a check using when. You recreate the Flag column setting it to 1 if status is ok or present, otherwise you keep the existing value.
from pyspark.sql.functions import when, col, lit

data = [
    ('present', 0),
    ('ok', 0),
    ('present', 1),
    ('void', 0),
    ('na', 1),
    ('notpresent', 0)
]
df = spark.createDataFrame(data, ['status', 'Flag'])
df.show()
df.withColumn('Flag', when(col('status').isin(['ok', 'present']), lit(1)).otherwise(col('Flag'))).show()
Output
+----------+----+
| status|Flag|
+----------+----+
| present| 0|
| ok| 0|
| present| 1|
| void| 0|
| na| 1|
|notpresent| 0|
+----------+----+
+----------+----+
| status|Flag|
+----------+----+
| present| 1|
| ok| 1|
| present| 1|
| void| 0|
| na| 1|
|notpresent| 0|
+----------+----+
The simplest way
df.withColumn('Flag', col('status').isin(['ok', 'present']).astype('int')).show()
+----------+----+
| status|Flag|
+----------+----+
| present| 1|
| ok| 1|
| present| 1|
| void| 0|
| na| 1|
|notpresent| 0|
+----------+----+

Spark dataframe - transform rows with same ID to columns

I want to transform below source dataframe (using pyspark):
Key  ID  segment
1    A   m1
2    A   m1
3    B   m1
4    C   m2
1    D   m1
2    E   m1
3    F   m1
4    G   m2
1    J   m1
2    J   m1
3    J   m1
4    K   m2
Into below result dataframe:
ID  key1  key2
A   1     2
B   3     -
C   4     -
D   1     -
F   3     -
G   4     -
J   1     2
J   1     3
J   2     3
K   4     -
In other words: I want to highlight the "pairs" in the dataframe - if I have more than one key for the same ID, I would like to show each relation on a different line.
Thank you for your help.
Use window functions. I assume - means a single-member group; if not, you can use a when/otherwise condition to blank the 1s out.
import sys
import pyspark.sql.functions as F
from pyspark.sql.functions import col, desc, lag
from pyspark.sql import Window

w = Window.partitionBy('ID').orderBy(desc('Key'))
df = (df.withColumn('key2', lag('segment').over(w))  # new column with the preceding segment value for each row
        .withColumn('key2', col('key2').isNotNull())  # boolean selection
        .withColumn('key2', F.sum(F.col('key2').cast('integer')).over(w.rowsBetween(Window.currentRow, sys.maxsize)) + 1)  # create cumulative groups
        .orderBy('ID', 'Key')  # reorder frame
     )
df.show()
+---+---+-------+----+
|Key| ID|segment|key2|
+---+---+-------+----+
| 1| A| m1| 2|
| 2| A| m1| 2|
| 3| B| m1| 1|
| 4| C| m2| 1|
| 1| D| m1| 1|
| 2| E| m1| 1|
| 3| F| m1| 1|
| 4| G| m2| 1|
| 1| J| m1| 2|
| 2| J| m1| 3|
| 3| J| m1| 3|
| 4| K| m2| 1|
+---+---+-------+----+

Create another column for checking different values in pyspark

I wish to have the expected output shown below.
My code:
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

pd_dataframe = pd.DataFrame({'id': [i for i in range(10)],
                             'values': [10, 5, 3, -1, 0, -10, -4, 10, 0, 10]})
sp_dataframe = spark.createDataFrame(pd_dataframe)
sign_acc_row = F.udf(lambda x: int(np.sign(x)), IntegerType())
sp_dataframe = sp_dataframe.withColumn('sign', sign_acc_row('values'))
sp_dataframe.show()
I want to create another column that increments by 1 whenever the value is different from the previous row.
Expected output:
id values sign numbering
0 0 10 1 1
1 1 5 1 1
2 2 3 1 1
3 3 -1 -1 2
4 4 0 0 3
5 5 -10 -1 4
6 6 -4 -1 4
7 7 10 1 5
8 8 0 0 6
9 9 10 1 7
Here's a way you can do it using a custom function:
import pyspark.sql.functions as F

# compare each value with the previous one
def f(x):
    c = 1
    l = [c]
    last_value = [x[0]]
    for i in x[1:]:
        if i == last_value[-1]:
            l.append(c)
        else:
            c += 1
            l.append(c)
            last_value.append(i)
    return l

# take the sign column as a list
sign_list = sp_dataframe.select('sign').rdd.map(lambda x: x.sign).collect()

# create a new dataframe using the output
sp = spark.createDataFrame(pd.DataFrame(f(sign_list), columns=['numbering']))
Appending a list as a column to a dataframe is a bit tricky in pyspark. For this we'll need to create a dummy row_idx to join the dataframes.
# create dummy indexes
sp_dataframe = sp_dataframe.withColumn("row_idx", F.monotonically_increasing_id())
sp = sp.withColumn("row_idx", F.monotonically_increasing_id())
# join the dataframes
final_df = (sp_dataframe
            .join(sp, sp_dataframe.row_idx == sp.row_idx)
            .orderBy('id')
            .drop("row_idx"))
final_df.show()
+---+------+----+---------+
| id|values|sign|numbering|
+---+------+----+---------+
| 0| 10| 1| 1|
| 1| 5| 1| 1|
| 2| 3| 1| 1|
| 3| -1| -1| 2|
| 4| 0| 0| 3|
| 5| -10| -1| 4|
| 6| -4| -1| 4|
| 7| 10| 1| 5|
| 8| 0| 0| 6|
| 9| 10| 1| 7|
+---+------+----+---------+
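For larger data, collecting the sign column to the driver can be avoided; here is a rough window-based sketch of the same numbering, assuming the id column defines the row order:

import pyspark.sql.functions as F
from pyspark.sql import Window

w = Window.orderBy('id')

# flag rows where the sign differs from the previous row, then take a running sum of the flags
numbered = (sp_dataframe
    .withColumn('changed', (F.col('sign') != F.lag('sign').over(w)).cast('integer'))
    .fillna(1, subset=['changed'])  # the first row has no predecessor and starts group 1
    .withColumn('numbering', F.sum('changed').over(w.rowsBetween(Window.unboundedPreceding, Window.currentRow)))
    .drop('changed'))
numbered.show()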

Getting first row based on condition

I have a dataframe where I want to get the first row where the indicator column is 0. For example, my dataframe will look like this:
network volume indicator Hour
YYY 20 1 10
YYY 30 0 9
YYY 40 0 8
YYY 80 1 7
TTT 50 0 10
TTT 40 1 8
TTT 10 0 4
TTT 10 1 2
The result should look like this:
network volume indicator Hour
YYY 20 1 10
YYY 30 0 9
YYY 80 1 7
TTT 50 0 10
TTT 40 1 8
TTT 10 1 2
So the rows with indicator 1 would still stay, while I only keep the first time the indicator was 0 for each network. I want everything sorted by Hour in descending order when I do this, so I get the most recent 0 indicator. How do I go about achieving this result?
Here is the required code, with comments inline to help you understand (the output has been updated for your latest dataset, which has multiple 1's in the indicator column):
sourceData.show()
+-------+------+---------+----+
|network|volume|indicator|Hour|
+-------+------+---------+----+
| YYY| 20| 1| 10|
| YYY| 30| 0| 9|
| YYY| 40| 0| 8|
| YYY| 80| 1| 7|
| TTT| 50| 0| 10|
| TTT| 40| 1| 8|
| TTT| 10| 0| 4|
| TTT| 10| 1| 2|
+-------+------+---------+----+
sourceData.printSchema()
root
|-- network: string (nullable = true)
|-- volume: integer (nullable = true)
|-- indicator: integer (nullable = true)
|-- Hour: integer (nullable = true)
Required Transformation Code:
//splitting your data set into two parts with indicator 1 and 0
val indicator1Df = sourceData.filter("indicator == 1")
val indicator0Df = sourceData.filter("indicator == 0")
//getting the first row for all indicator=0
indicator0Df.createOrReplaceTempView("indicator0")
val firstIndicator0df = spark.sql("select network, volume, indicator, hour from (select i0.network,i0.volume,i0.indicator,i0.hour,ROW_NUMBER() over (partition by i0.network order by i0.Hour desc) as rnk from indicator0 i0) i where rnk = 1")
//merging both dataframes back together for the required output
val finalDf = indicator1Df.union(firstIndicator0df).orderBy($"network".desc,$"Hour".desc)
finalDf.show()
Final Output:
+-------+------+---------+----+
|network|volume|indicator|Hour|
+-------+------+---------+----+
| YYY| 20| 1| 10|
| YYY| 30| 0| 9|
| YYY| 80| 1| 7|
| TTT| 50| 0| 10|
| TTT| 40| 1| 8|
| TTT| 10| 1| 2|
+-------+------+---------+----+
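The answer above is in Scala; a rough PySpark equivalent of the same approach (keep every indicator == 1 row and add back only the most recent indicator == 0 row per network) might look like this, assuming the same dataframe is available in PySpark as sourceData:

import pyspark.sql.functions as F
from pyspark.sql import Window

# rank the indicator == 0 rows per network by Hour, most recent first
w = Window.partitionBy('network').orderBy(F.desc('Hour'))

first_zero = (sourceData.filter(F.col('indicator') == 0)
                        .withColumn('rnk', F.row_number().over(w))
                        .filter(F.col('rnk') == 1)
                        .drop('rnk'))

# keep all indicator == 1 rows and union the single most recent 0 row per network
result = (sourceData.filter(F.col('indicator') == 1)
                    .union(first_zero)
                    .orderBy(F.desc('network'), F.desc('Hour')))
result.show()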

Pyspark - Get first column occurrence of a value in a spark dataframe

I have a dataframe as below; I need the first and last occurrence of the value 0, and of the non-zero values:
Id Col1 Col2 Col3 Col4
1 1 0 0 2
2 0 0 0 0
3 4 2 2 4
4 2 5 9 0
5 0 4 0 0
Expected Result:
Id Col1 Col2 Col3 Col4 First_0 Last_0 First_non_zero Last_non_zero
1 1 0 0 2 2 3 1 4
2 0 0 0 0 1 4 0 0
3 4 2 2 4 0 0 1 4
4 2 5 9 0 4 4 1 3
5 0 4 0 0 1 4 2 2
Here is one way to use pyspark's F.array(), F.greatest() and F.least():
from pyspark.sql import functions as F
df = spark.createDataFrame(
    [(1,1,0,0,2), (2,0,0,0,0), (3,4,2,2,4), (4,2,5,9,0), (5,0,4,0,0)]
    , ['Id','Col1','Col2','Col3','Col4']
)
df.show()
#+---+----+----+----+----+
#| Id|Col1|Col2|Col3|Col4|
#+---+----+----+----+----+
#| 1| 1| 0| 0| 2|
#| 2| 0| 0| 0| 0|
#| 3| 4| 2| 2| 4|
#| 4| 2| 5| 9| 0|
#| 5| 0| 4| 0| 0|
#+---+----+----+----+----+
# column names involved in the calculation
cols = df.columns[1:]
# create an array column `arr_0` holding, for each column in cols, its 1-based position when that value is 0 (null otherwise)
# then take the least and greatest entries to get first_0 and last_0
# fillna with 0 when none of the values is 0
df.withColumn('arr_0', F.array([ F.when(F.col(cols[i])==0, i+1) for i in range(len(cols))])) \
.withColumn('first_0', F.least(*[F.col('arr_0')[i] for i in range(len(cols))])) \
.withColumn('last_0', F.greatest(*[F.col('arr_0')[i] for i in range(len(cols))])) \
.fillna(0, subset=['first_0', 'last_0']) \
.show()
#+---+----+----+----+----+------------+-------+------+
#| Id|Col1|Col2|Col3|Col4| arr_0|first_0|last_0|
#+---+----+----+----+----+------------+-------+------+
#| 1| 1| 0| 0| 2| [, 2, 3,]| 2| 3|
#| 2| 0| 0| 0| 0|[1, 2, 3, 4]| 1| 4|
#| 3| 4| 2| 2| 4| [,,,]| 0| 0|
#| 4| 2| 5| 9| 0| [,,, 4]| 4| 4|
#| 5| 0| 4| 0| 0| [1,, 3, 4]| 1| 4|
#+---+----+----+----+----+------------+-------+------+
If you are using pyspark 2.4, you can also try F.array_min() and F.array_max():
df.withColumn('arr_0', F.array([ F.when(F.col(cols[i])==0, i+1) for i in range(len(cols)) ])) \
.select('*', F.array_min('arr_0').alias('first_0'), F.array_max('arr_0').alias('last_0')) \
.fillna(0, subset=['first_0', 'last_0']) \
.show()
#+---+----+----+----+----+------------+-------+------+
#| Id|Col1|Col2|Col3|Col4| arr_0|first_0|last_0|
#+---+----+----+----+----+------------+-------+------+
#| 1| 1| 0| 0| 2| [, 2, 3,]| 2| 3|
#| 2| 0| 0| 0| 0|[1, 2, 3, 4]| 1| 4|
#| 3| 4| 2| 2| 4| [,,,]| 0| 0|
#| 4| 2| 5| 9| 0| [,,, 4]| 4| 4|
#| 5| 0| 4| 0| 0| [1,, 3, 4]| 1| 4|
#+---+----+----+----+----+------------+-------+------+
You can do the same for first_non_zero and last_non_zero.
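For completeness, a sketch of those two columns using the same pattern (reusing the df and cols defined above); only the when condition changes:

# same pattern as arr_0, but keeping the positions of the non-zero values
df.withColumn('arr_nz', F.array([ F.when(F.col(cols[i])!=0, i+1) for i in range(len(cols)) ])) \
 .withColumn('first_non_zero', F.least(*[F.col('arr_nz')[i] for i in range(len(cols))])) \
 .withColumn('last_non_zero', F.greatest(*[F.col('arr_nz')[i] for i in range(len(cols))])) \
 .fillna(0, subset=['first_non_zero', 'last_non_zero']) \
 .show()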