Append aggregate stats from loop in Pyspark table - pyspark

I have the following Pyspark code. In each iteration in the loop, I filter out all rows with a particular string in column H. Then I calculate some aggregate stats on column G (results in 3 values). I want to save all the aggregate counts in a single table (rows: CM, NCM, FP; and columns: POP, POP N, POP SN, POP QP).
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
# Local Spark session used by every snippet below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Fixed seed so the random columns E, G and H are reproducible.
np.random.seed(0)
rows = 1000

# Sample pandas frame; G holds the categories to count and H the block labels
# that the loop filters on.
df_pandas = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(rows)), dtype='float32'),
    'D': np.array([3] * rows, dtype='int32'),
    'E': pd.Categorical(np.random.choice(["test", "train", "frog", "chicken"], rows)),
    'F': 'foo',
    'G': np.random.choice(['CM', 'NCM', 'FP'], rows),
    'H': np.random.choice(['POP', 'POP N', 'POP SN', 'POP QP'], rows),
})
df_spark = spark.createDataFrame(df_pandas)
blocks = ['POP', 'POP N', 'POP SN', 'POP QP']
for block in blocks:
    # Keep only the rows whose H is NOT the current block ...
    df_spark_trimmed = df_spark.filter(~F.col('H').isin(block))
    # ... and count the remaining rows per value of G (one count per CM/NCM/FP).
    counts = df_spark_trimmed.groupby('G').count()
    counts.show()

Use join:
import pyspark.sql.functions as F
# Row totals per G, and per (G, H) pair.
count_by_g = df_spark.groupBy('G').agg(
    F.count('*').alias('CountByG')
)
count_by_gh = df_spark.groupBy(['G', 'H']).agg(
    F.count('*').alias('CountByGH')
)

# "Rows of G that are not in block H" = total for G minus the count for (G, H);
# pivot H into columns so each G becomes a single row.
(
    count_by_g
    .join(count_by_gh, ['G'])
    .selectExpr('G', 'H', 'CountByG - CountByGH as count')
    .groupBy('G')
    .pivot('H')
    .agg(F.max('count').alias('count'))
    .show()
)
+---+---+-----+------+------+
| G|POP|POP N|POP QP|POP SN|
+---+---+-----+------+------+
| CM|256| 260| 245| 250|
|NCM|265| 254| 248| 262|
| FP|246| 236| 239| 239|
+---+---+-----+------+------+
Or another solution with window functions:
# Same result without a join: the windowed sum gives the total per G, from which
# the (G, H) count is subtracted before pivoting H into columns.
(
    df_spark
    .groupBy(['G', 'H'])
    .count()
    .selectExpr('G', 'H', 'sum(count) over (partition by G) - count as count')
    .groupBy('G')
    .pivot('H')
    .agg(F.max('count').alias('count'))
    .show()
)
+---+---+-----+------+------+
| G|POP|POP N|POP QP|POP SN|
+---+---+-----+------+------+
| CM|256| 260| 245| 250|
|NCM|265| 254| 248| 262|
| FP|246| 236| 239| 239|
+---+---+-----+------+------+

Related

How to combine UDFs when creating a new column in Pyspark 1.6

I am trying to aggregate a table that I have around one key value (id here) so that I can have one row per id and perform some verifications on the rows that belong to each id in order to identify the 'result' (type of transaction of sorts). Let's say that after aggregating, I have something like this:
sc = SparkContext()
cols = ['id', 'list1', 'list2']
# One tuple per id: (id, codes in list1, values in list2 aligned by position).
data = [
    ('zero', ['cd1', 'cd7', 'cd5', 'cd2'], ['', '', '', 'debit']),
    ('one', ['cd2', 'cd3', 'cd9', 'cd6'], ['credit', '', '', '']),
    ('two', ['cd4', 'cd3', 'cd5', 'cd1'], ['', '', '', '']),
]
rdd = sc.parallelize(data)
df = rdd.toDF(cols)
>>> df.show()
+----+--------------------+--------------+
| id| list1| list2|
+----+--------------------+--------------+
|zero|[cd1, cd7, cd5, cd2]| [, , , debit]|
| one|[cd2, cd3, cd9, cd6]|[credit, , , ]|
| two|[cd4, cd3, cd5, cd1]| [, , , ]|
+----+--------------------+--------------+
The question I have to answer here is: does list1 have cd9 in it? If so, what is the corresponding value in list2 of list1's cd2?
What I have done to solve it was defining a couple of UDFs, since array functions in PySpark 1.6 are limited:
# Positions of every element of the first array that equals the second argument.
enum = F.udf(
    lambda values, target: [pos for pos, item in enumerate(values) if item == target],
    T.ArrayType(T.IntegerType()),
)
# Elements of the first array whose position is listed in the second array.
elat = F.udf(
    lambda values, wanted: [item for pos, item in enumerate(values) if (pos in wanted)],
    T.ArrayType(T.StringType()),
)
# Constant empty integer array, used when there is nothing to look up.
nulls = F.udf(lambda: [], T.ArrayType(T.IntegerType()))
Then creating a new 'lookup' column with the indexes of the elements I want to grab from the other column of lists:
# When list1 contains cd7 or cd9, store the positions of 'cd2' within list1;
# otherwise store an empty array.
df = df.withColumn(
    'lookup',
    F.when(
        F.array_contains(F.col('list1'), 'cd7') | F.array_contains(F.col('list1'), 'cd9'),
        enum(F.col('list1'), F.lit('cd2')),
    ).otherwise(nulls()),
)
And finally using this column to reach my endgoal:
# Branches are evaluated top to bottom and the first match wins, exactly like
# the nested otherwise(when(...)) form: cd7 rules first, then cd9, then 'etc'.
df = df.withColumn(
    'result',
    F.when(
        F.array_contains(F.col('list1'), 'cd7')
        & F.array_contains(elat(F.col('list2'), F.col('lookup')), 'debit'),
        'CD 7 - DEBIT',
    )
    .when(
        F.array_contains(F.col('list1'), 'cd7')
        & F.array_contains(elat(F.col('list2'), F.col('lookup')), 'credit'),
        'CD 7 - CREDIT',
    )
    .when(
        F.array_contains(F.col('list1'), 'cd9')
        & F.array_contains(elat(F.col('list2'), F.col('lookup')), 'debit'),
        'CD 9 - DEBIT',
    )
    .when(
        F.array_contains(F.col('list1'), 'cd9')
        & F.array_contains(elat(F.col('list2'), F.col('lookup')), 'credit'),
        'CD 9 - CREDIT',
    )
    .otherwise('etc'),
)
>>> df.show()
+----+--------------------+--------------+------+-------------+
| id| list1| list2|lookup| result|
+----+--------------------+--------------+------+-------------+
|zero|[cd1, cd7, cd5, cd2]| [, , , debit]| [3]| CD 7 - DEBIT|
| one|[cd2, cd3, cd9, cd6]|[credit, , , ]| [0]|CD 9 - CREDIT|
| two|[cd4, cd3, cd5, cd1]| [, , , ]| []| etc|
+----+--------------------+--------------+------+-------------+
But I would very much prefer if there was a way to achieve the same without creating one extra column, because the actual dataframe has more columns and the lookup list may need to access different columns depending on the rule that I need to check for. When I tried to combine both elat and enum UDFs on one go it was unable to compute one or the other.

pyspark create a structType with conditions in a dateframe

I have this dataframe
Item value
A 1
B 1
C 2
D 2
I want to get this
Item value
{A,B} 1
{C,D} 2
Something like a groupBy on 'value' that combines the 'Item' values.
Your DF:
# Sample data: each Item belongs to exactly one value.
df = sqlContext.createDataFrame(
    [('A', 1), ('B', 1), ('C', 2), ('D', 2)],
    ['Item', 'value'],
)
Steps are:
create a dummy matrix
apply a lambda function
map the list returned by the lambda function to the column names list
from pyspark.sql import functions as F
def map_func(list, columns=None):  # noqa: A002 - name kept so existing callers still work
    """Return the column names whose flag in ``list`` equals 1.

    Args:
        list: Sequence of flags (for example a pivoted row of 0/1 values),
            one entry per column name.
        columns: Column names aligned position-by-position with ``list``.
            When omitted, the module-level ``cols`` is used, which is the
            original behaviour.

    Returns:
        A new list with the names whose corresponding flag equals 1, in
        positional order. Empty when no flag is set.
    """
    if columns is None:
        # Original behaviour: rely on the ``cols`` global defined by the caller.
        columns = cols
    return [columns[index] for index, elem in enumerate(list) if elem == 1]
from pyspark.sql.types import ArrayType, StringType

# One row per value and one 0/1 column per Item: 1 where the (value, Item)
# pair exists, 0 (via fillna) where it does not.
df = df.groupBy("value").pivot("Item").agg(F.lit(1)).fillna(0)
# Names of the pivoted Item columns (everything after the leading "value"
# column). map_func reads this module-level list.
cols = df.columns[1:]
# Declare the return type explicitly: without it F.udf defaults to StringType
# and the Python list would be stored as its string representation instead of
# a real array column.
map_func_lamb = F.udf(lambda row: map_func(row), ArrayType(StringType()))
df = df.withColumn("list", map_func_lamb(F.struct([df[x] for x in cols])))\
    .select(F.col("list").alias("Item"), "value")
df.show()
+------+-----+
| Item|value|
+------+-----+
|[A, B]| 1|
|[C, D]| 2|
+------+-----+
Not sure if you really want a dict as a result or whether a list is fine, but I don't see much difference at this point.

pyspark, get rows where first column value equals id and second column value is between two values, do this for each row in a dataframe

So I have one pyspark dataframe like so, let's call it dataframe a:
+-------------------+---------------+----------------+
| reg| val1| val2 |
+-------------------+---------------+----------------+
| N110WA| 1590030660| 1590038340000|
| N876LF| 1590037200| 1590038880000|
| N135MH| 1590039060| 1590040080000|
And another like this, let's call it dataframe b:
+-----+-------------+-----+-----+---------+----------+---+----+
| reg| postime| alt| galt| lat| long|spd| vsi|
+-----+-------------+-----+-----+---------+----------+---+----+
|XY679|1590070078549| 50| 130|18.567169|-69.986343|132|1152|
|HI949|1590070091707| 375| 455| 18.5594|-69.987804|148|1344|
|JX784|1590070110666| 825| 905|18.544968|-69.990414|170|1216|
Is there some way to create a numpy array or pyspark dataframe, where for each row in dataframe a, all the rows in dataframe b with the same reg and postime between val 1 and val 2, are included?
You can try the solution below; let us know whether it works or whether something else is expected.
I have modified the inputs a little in order to showcase the working solution.
Input here
from pyspark.sql import functions as F
# df_a: one [val1, val2] window per row; only the second window contains the
# postime value stored in df_b.
df_a = spark.createDataFrame(
    [
        ('N110WA', 1590030660, 1590038340000),
        ('N110WA', 1590070078549, 1590070078559),
    ],
    ["reg", "val1", "val2"],
)
df_b = spark.createDataFrame(
    [('N110WA', 1590070078549)],
    ["reg", "postime"],
)
df_a.show()
df_a
+------+-------------+-------------+
| reg| val1| val2|
+------+-------------+-------------+
|N110WA| 1590030660|1590038340000|
|N110WA|1590070078549|1590070078559|
+------+-------------+-------------+
df_b
+------+-------------+
| reg| postime|
+------+-------------+
|N110WA|1590070078549|
+------+-------------+
Solution here
from pyspark.sql import types as T
from pyspark.sql import functions as F
# Bring postime onto every df_a row with the same reg. Without this join the
# 'postime' column referenced below does not exist on df_a.
df_a = df_a.join(df_b, 'reg', 'left')
# Keep only the rows whose postime lies inside the inclusive [val1, val2] window.
# Rows with no match (null postime) are dropped by the filter.
df_a = df_a.filter(
    (F.col('postime') >= F.col('val1')) & (F.col('postime') <= F.col('val2'))
)
df_a.show()
Final Output
+------+-------------+-------------+-------------+
| reg| val1| val2| postime|
+------+-------------+-------------+-------------+
|N110WA|1590070078549|1590070078559|1590070078549|
+------+-------------+-------------+-------------+
Yes, assuming df_a and df_b are both pyspark dataframes, you can use an inner join in pyspark:
# Optional tolerance added on both sides of the [val1, val2] window;
# 0 means the bounds are used exactly as stored.
delta = 0
# df_a holds reg/val1/val2 and df_b holds reg/postime, so the range condition
# compares df_b.postime against df_a's bounds.
df = df_a.join(df_b, [
    df_a.reg == df_b.reg,
    df_b.postime >= df_a.val1 - delta,
    df_b.postime <= df_a.val2 + delta,
], "inner")
Will filter out the results to only include the ones specified

Validate data from the same column in different rows with pyspark

How can I change the value of a column depending on some validation between cells in different rows? What I need is to compare the kilometraje values across each customer's (id) records and check whether each later record has a kilometraje that is higher than or equal to the previous one.
fecha id estado id_cliente error_code kilometraje error_km
1/1/2019 1 A 1 10
2/1/2019 2 A ERROR 20
3/1/2019 1 D 1 ERROR 30
4/1/2019 2 O ERROR
The error in the error_km column is because for customer (id) 2 the kilometraje value is less than the same customer record for 2/1/2019 (If time passes the car is used so the kilometraje increases, so that there is no error the mileage has to be higher or the same)
I know that withColumn I can overwrite or create a column that doesn't exist and that using when I can set conditions. For example: This would be the code I use to validate the estado and id_cliente column and ERROR overwrite the error_code column where applicable, but I don't understand how to validate between different rows for the same client.
from pyspark.sql.functions import lit
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark import StorageLevel  # needed by persist() below

file_path = 'archive.txt'
error = 'ERROR'
df = spark.read.parquet(file_path)
df = df.persist(StorageLevel.MEMORY_AND_DISK)
df = df.select('estado', 'id_cliente')
df = df.withColumn("error_code", lit(''))
# The conditions must use the columns kept by the select above
# ('estado' / 'id_cliente'); any other name fails to resolve.
# Parentheses make the existing "& binds tighter than |" grouping explicit.
df = df.withColumn(
    'error_code',
    F.when(
        ((F.col('estado') == 'O') & (F.col('id_cliente') != ''))
        | ((F.col('estado') == 'D') & (F.col('id_cliente') != ''))
        | ((F.col('estado') == 'A') & (F.col('id_cliente') == '')),
        # Append the error marker to whatever error_code already holds.
        F.concat(F.col("error_code"), F.lit(":[{}]".format(error))),
    ).otherwise(F.col('error_code')),
)
You achieve that with the lag window function. The lag function returns you the row before the current row. With that you can easily compare the kilometraje values. Have a look at the code below:
import pyspark.sql.functions as F
from pyspark.sql import Window
l = [
    ('1/1/2019', 1, 10),
    ('2/1/2019', 2, 20),
    ('3/1/2019', 1, 30),
    ('4/1/2019', 1, 10),
    ('5/1/2019', 1, 30),
    ('7/1/2019', 3, 30),
    ('4/1/2019', 2, 5),
]
columns = ['fecha', 'id', 'kilometraje']
df = spark.createDataFrame(l, columns)
# Day and month are not zero-padded in the data ('1/1/2019'), so use the
# single-letter pattern: 'd/M/yyyy' accepts one- and two-digit values, while
# 'dd/MM/yyyy' is rejected for such input by Spark 3's stricter parser.
df = df.withColumn('fecha', F.to_date(df.fecha, 'd/M/yyyy'))
# Per id, ordered by date, so lag() yields the previous reading of the same id.
w = Window.partitionBy('id').orderBy('fecha')
# Flag a row when the previous kilometraje of the same id was higher.
# The first row of each id has no previous value (null) and stays unflagged.
df = df.withColumn(
    'error_km',
    F.when(F.lag('kilometraje').over(w) > df.kilometraje, F.lit('ERROR')).otherwise(F.lit('')),
)
df.show()
Output:
+----------+---+-----------+--------+
| fecha| id|kilometraje|error_km|
+----------+---+-----------+--------+
|2019-01-01| 1| 10| |
|2019-01-03| 1| 30| |
|2019-01-04| 1| 10| ERROR|
|2019-01-05| 1| 30| |
|2019-01-07| 3| 30| |
|2019-01-02| 2| 20| |
|2019-01-04| 2| 5| ERROR|
+----------+---+-----------+--------+
The fourth row doesn't get labeled with 'ERROR' as the previous value had a smaller kilometraje value (10 < 30). When you want to label all the id's with 'ERROR' which contain at least one corrupted row, perform a left join.
# Build one 'ERROR' row per id that has at least one flagged record, then
# left-join it back so every row of such an id carries the flag.
(
    df.drop('error_km')
    .join(
        df.filter(df.error_km == 'ERROR')
        .groupby('id')
        .agg(F.first(df.error_km).alias('error_km')),
        'id',
        'left',
    )
    .show()
)
I use .rangeBetween(Window.unboundedPreceding, 0).
This window frame covers every row from the start of the partition up to the current row, so the maximum mileage seen so far can be compared with the current value.
import pyspark
from pyspark.sql.functions import lit
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql import Window
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)
error = 'This is error'
l = [
    ('1/1/2019', 1, 10),
    ('2/1/2019', 2, 20),
    ('3/1/2019', 1, 30),
    ('4/1/2019', 1, 10),
    ('5/1/2019', 1, 22),
    ('7/1/2019', 1, 23),
    ('22/1/2019', 2, 5),
    ('11/1/2019', 2, 24),
    ('13/2/2019', 1, 16),
    ('14/2/2019', 2, 18),
    ('5/2/2019', 1, 19),
    ('6/2/2019', 2, 23),
    ('7/2/2019', 1, 14),
    ('8/3/2019', 1, 50),
    ('8/3/2019', 2, 50),
]
columns = ['date', 'vin', 'mileage']
df = spark.createDataFrame(l, columns)
# Day and month are not zero-padded in the data ('1/1/2019'), so use the
# single-letter pattern: 'd/M/yyyy' accepts one- and two-digit values, while
# 'dd/MM/yyyy' is rejected for such input by Spark 3's stricter parser.
df = df.withColumn('date', F.to_date(df.date, 'd/M/yyyy'))
# Frame: all rows of the same vin from the earliest date up to the current date.
w = Window.partitionBy('vin').orderBy('date').rangeBetween(Window.unboundedPreceding, 0)
# Running maximum of mileage per vin. No placeholder columns are needed first:
# withColumn creates 'max' and 'error_code' in this same order.
df = df.withColumn('max', F.max('mileage').over(w))
# A reading below the running maximum means the mileage went backwards.
df = df.withColumn(
    'error_code',
    F.when(F.col('mileage') < F.col('max'), F.lit('ERROR')).otherwise(F.lit('')),
)
df.show()
Finally, all that remains is to remove the column that has the maximum
# Remove the helper column 'max'; the remaining columns are shown unchanged.
df = df.drop('max')
df.show()

How to replace leading 0 with 91 using regex in pyspark dataframe

In python I am doing this to replace leading 0 in column phone with 91.
But how to do it in pyspark.
con dataframe is :
id phone1
1 088976854667
2 089706790002
The output I want is:
1 9188976854667
2 9189706790002
# Replace the leading zero in a phone number with 91.
# The replacement must be '91' to match the stated goal (a different prefix
# was used here before).
con.filter(regex='[_]').replace('^0','91',regex=True)
You are looking for the regexp_replace function. This function takes 3 parameters:
column name
pattern
replacement
from pyspark.sql import functions as F
columns = ['id', 'phone1']
vals = [
    (1, '088976854667'),
    (2, '089706790002'),
]
df = spark.createDataFrame(vals, columns)
# "^0" anchors the match to the start of the string, so only a leading zero
# is replaced with "91".
df = df.withColumn(
    'phone1',
    F.regexp_replace('phone1', "^0", "91"),
)
df.show()
Output:
+---+-------------+
| id| phone1|
+---+-------------+
| 1|9188976854667|
| 2|9189706790002|
+---+-------------+