How to pass variable to UDAF (user defined aggregation function) - pyspark

import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import PandasUDFType, pandas_udf
from pyspark.sql.types import *
import os
@pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)
def split(df, validation_period):
    """Logic"""
    return df

def train_test_split(spark, data_frame, request_json_data):
    data_frame = spark.createDataFrame(data_frame)
    print(data_frame.schema)
    validation_period = request_json_data['validation_period']
    groupby_key = request_json_data['groupby_key']
    data_frame.groupby(groupby_key).apply(split, validation_period).show()
Calling the split function fails with the error: apply() takes 2 positional arguments but 3 were given. I want to pass validation_period as an argument to the split function.

Short answer: you can't pass an extra argument to a pandas grouped map UDF, since it expects only a single pandas DataFrame as its argument.
Long answer: there are other ways you can pass validation_period to the function.
Use some form of a closure (see the usage sketch after the function):
def split_fabric(validation_period):
    @pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)
    def split(df):
        """Logic"""
        return df
    return split
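The outer function builds the UDF with validation_period captured in its scope and returns it, so you apply the result of the call. A minimal usage sketch, assuming schema and groupby_key are defined as in the question:
data_frame.groupby(groupby_key).apply(split_fabric(validation_period)).show()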
Or pass it as a column:
data_frame \
    .withColumn("validation_period", F.lit(validation_period)) \
    .groupby(groupby_key).apply(split).show()
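Inside the UDF the value is then read back from the literal column, which is constant within each group. A minimal sketch, assuming the GROUPED_MAP output schema also contains the validation_period column:
@pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)
def split(df):
    validation_period = df["validation_period"].iloc[0]  # same value for every row in the group
    # ... split logic using validation_period ...
    return df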

Related

How to convert enumerate pyspark code into scala code

Below is the pyspark code for matrix multiplication. I need the same logic in Scala, since this approach works well for large-volume datasets.
from pyspark import SparkConf, SparkContext
from pyspark.sql import functions as F
from functools import reduce

df = spark.sql("select * from tablename")

colDFs = []
for c2 in df.columns:
    colDFs.append(df.select([F.sum(df[c1] * df[c2]).alias("op_{0}".format(i)) for i, c1 in enumerate(df.columns)]))

mtx = reduce(lambda a, b: a.select(a.columns).union(b.select(a.columns)), colDFs)
mtx.show()
For enumerate you can use zipWithIndex, as in df.columns.zipWithIndex.
I didn't test it, but overall the code should be something like:
val colDFs = df.columns.map { c2 =>
  df.select(df.columns.zipWithIndex.map { case (c1, i) =>
    sum(col(c1) * col(c2)).alias(s"op_$i")
  }: _*)
}
colDFs.reduce((a, b) => a.select(a.columns.map(col): _*).union(b.select(a.columns.map(col): _*)))

pyspark groupby mean using dictionary variable

I am trying to compute a groupby mean on a pyspark dataframe, using a mean function stored in a dictionary variable.
from pyspark.sql import functions as F
_func= {'mean' : F.mean}
df.groupby('name')._func['mean']()
But this fails with the error:
AttributeError: 'GroupedData' object has no attribute '_func'
I tried importing the mean function of the GroupedData class from pyspark.sql.group too, but it fails with the same error.
How can I fix this error?
You need to pass the dictionary in agg.
df = df.groupby('name').agg({'column_name': 'mean'})
If you want to use a dictionary of functions, use it like this:
from pyspark.sql import functions as F
_func = {'mean': F.mean}
df = df.groupby('name').agg(_func['mean']('column_name'))
EDIT:
According to your requirements as mentioned in the comments, this is the only solution I could come up with:
df = df.groupby('name').agg(*[_func['mean'](x) for x in df.columns])
cols_to_delete = [_c for _c in df.columns if df.where(F.col(_c).isNotNull()).count() == 0]
df = df.drop(*cols_to_delete)
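A hypothetical variation on the same idea: restrict the aggregation to numeric columns up front (using df.dtypes on the original dataframe) instead of dropping all-null result columns afterwards, assuming the _func dictionary from above:
numeric_cols = [c for c, t in df.dtypes if t in ('int', 'bigint', 'float', 'double')]
df = df.groupby('name').agg(*[_func['mean'](c) for c in numeric_cols])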

Pyspark Logistic Regression, accessing probabilities [duplicate]

I have a dataframe df with a VectorUDT column named features. How do I get an element of the column, say first element?
I've tried doing the following
from pyspark.sql.functions import udf
first_elem_udf = udf(lambda row: row.values[0])
df.select(first_elem_udf(df.features)).show()
but I get a net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype) error. I get the same error if I do first_elem_udf = udf(lambda row: row.toArray()[0]) instead.
I also tried explode() but I get an error because it requires an array or map type.
This should be a common operation, I think.
Convert output to float:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit, udf
def ith_(v, i):
    try:
        return float(v[i])
    except ValueError:
        return None

ith = udf(ith_, DoubleType())
Example usage:
from pyspark.ml.linalg import Vectors

df = sc.parallelize([
    (1, Vectors.dense([1, 2, 3])),
    (2, Vectors.sparse(3, [1], [9]))
]).toDF(["id", "features"])

df.select(ith("features", lit(1))).show()
## +-----------------+
## |ith_(features, 1)|
## +-----------------+
## | 2.0|
## | 9.0|
## +-----------------+
Explanation:
Output values have to be reserialized to equivalent Java objects. If you want to access values (beware of SparseVectors), you should use the item method:
v.values.item(0)
which returns standard Python scalars. Similarly, if you want to access all values as a dense structure:
v.toArray().tolist()
If you prefer using spark.sql, you can use the following custom function 'to_array' to convert the vector to an array. Then you can manipulate it as an array.
from pyspark.sql.types import ArrayType, DoubleType

def to_array_(v):
    return v.toArray().tolist()

from pyspark.sql import SQLContext
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark, jsqlContext=None)
sqlContext.udf.register("to_array", to_array_, ArrayType(DoubleType()))
Example:
from pyspark.ml.linalg import Vectors

df = sc.parallelize([
    (1, Vectors.dense([1, 2, 3])),
    (2, Vectors.sparse(3, [1], [9]))
]).toDF(["id", "features"])
df.createOrReplaceTempView("tb")

spark.sql("""select *, to_array(features)[1] Second from tb""").toPandas()
Output:
   id         features  Second
0   1  [1.0, 2.0, 3.0]     2.0
1   2  (0.0, 9.0, 0.0)     9.0
I ran into the same problem with not being able to use explode(). One thing you can do is use VectorSlicer from the pyspark.ml.feature library, like so:
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row
slicer = VectorSlicer(inputCol="features", outputCol="features_one", indices=[0])
output = slicer.transform(df)
output.select("features", "features_one").show()
For anyone trying to split the probability column generated after training a PySpark ML model into usable columns: this does not use a UDF or numpy, and it only works for binary classification. Here lr_pred is the dataframe that holds the predictions from the logistic regression model.
from pyspark.sql.functions import split, regexp_replace
from pyspark.sql.types import DoubleType
prob_df1 = lr_pred.withColumn("probability", lr_pred["probability"].cast("String"))
prob_df = prob_df1.withColumn('probabilityre', split(regexp_replace("probability", "^\[|\]", ""), ",")[1].cast(DoubleType()))
Since Spark 3.0.0 this can be done without using UDF.
from pyspark.ml.functions import vector_to_array
https://discuss.dizzycoding.com/how-to-split-vector-into-columns-using-pyspark/
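A minimal sketch of how that might look (the added column names here are just for illustration, assuming a vector column named features):
from pyspark.ml.functions import vector_to_array

# expose the vector as an array column, then pick individual elements out of it
df = df.withColumn("features_arr", vector_to_array("features"))
df.select(df.features_arr[0].alias("first"), df.features_arr[1].alias("second")).show()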
Why is Vector[Double] used in the results? That's not a very nice data type.

Type conversion error from LabeledPoint in pyspark.mllib, for using linear regression model in pyspark.ml

I have the following code for linear regression using the pyspark.ml package. However, I get this error message for the last line, when the model is being fit:
IllegalArgumentException: u'requirement failed: Column features must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually org.apache.spark.mllib.linalg.VectorUDT@f71b0bce.'
Does anyone have an idea what is missing?
Is there any replacement in pyspark.ml for LabeledPoint in pyspark.mllib?
from pyspark import SparkContext
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pandas import *

data = sc.textFile("/FileStore/tables/w7baik1x1487076820914/randomTableSmall.csv")

def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[1], [values[0]])

points_df = data.map(parsePoint).toDF()
lr = LinearRegression()
model = lr.fit(points_df, {lr.regParam: 0.0})
The problem is that newer versions of Spark have a Vector class in the linalg module of ml, so you do not need to get it from mllib.linalg. Newer versions also do not accept spark.mllib.linalg.VectorUDT in ml. Here is code that should work for you:
from pyspark import SparkContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
import numpy as np

data = sc.textFile("/FileStore/tables/w7baik1x1487076820914/randomTableSmall.csv")

def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return (values[1], Vectors.dense([values[0]]))

points_df = data.map(parsePoint).toDF(['label', 'features'])
lr = LinearRegression()
model = lr.fit(points_df)
Newer Spark versions don't accept spark.mllib.linalg.VectorUDT (you do not need to get it from mllib.linalg).
Try to replace
from pyspark.mllib.regression import LabeledPoint
by:
from pyspark.ml.linalg import Vectors
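If you already have a DataFrame whose features column holds old-style mllib vectors, another option (not mentioned in the answers above, but a standard utility) is to convert the column in place:
from pyspark.mllib.util import MLUtils

# convert mllib vector columns to their ml equivalents so pyspark.ml estimators accept them
points_df = MLUtils.convertVectorColumnsToML(points_df, "features")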

How to do normalization with MinMaxScaler within each group after using group by on a spark dataframe? [duplicate]

I want to scale data with StandardScaler (from pyspark.mllib.feature import StandardScaler). Right now I can do it by passing the values of the RDD to the transform function, but the problem is that I want to preserve the key. Is there any way to scale my data while preserving its key?
Sample dataset
0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.
0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.
0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,smurf.
Imports
import sys
import os
from collections import OrderedDict
from numpy import array
from math import sqrt

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.clustering import KMeans
    from pyspark.mllib.feature import StandardScaler
    from pyspark.statcounter import StatCounter
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)
Portion of code
sc = SparkContext(conf=conf)
raw_data = sc.textFile(data_file)
parsed_data = raw_data.map(Parseline)
Parseline function:
def Parseline(line):
    line_split = line.split(",")
    clean_line_split = [line_split[0]] + line_split[4:-1]
    return (line_split[-1], array([float(x) for x in clean_line_split]))
Not exactly a pretty solution, but you can adjust my answer to the similar Scala question. Let's start with some example data:
import numpy as np

np.random.seed(323)

keys = ["foo"] * 50 + ["bar"] * 50
values = (
    np.vstack([np.repeat(-10, 500), np.repeat(10, 500)]).reshape(100, -1) +
    np.random.rand(100, 10)
)

rdd = sc.parallelize(zip(keys, values))
Unfortunately, MultivariateStatisticalSummary is just a wrapper around a JVM model and is not really Python friendly. Luckily, with NumPy arrays we can use the standard StatCounter to compute statistics by key:
from pyspark.statcounter import StatCounter

def compute_stats(rdd):
    return rdd.aggregateByKey(
        StatCounter(), StatCounter.merge, StatCounter.mergeStats
    ).collectAsMap()
Finally we can map to normalize:
def scale(rdd, stats):
    def scale_(kv):
        k, v = kv
        return (v - stats[k].mean()) / stats[k].stdev()
    return rdd.map(scale_)

scaled = scale(rdd, compute_stats(rdd))
scaled.first()
## array([ 1.59879188, -1.66816084, 1.38546532, 1.76122047, 1.48132643,
## 0.01512487, 1.49336769, 0.47765982, -1.04271866, 1.55288814])
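Since the title asks about MinMaxScaler-style normalization, here is a hypothetical variant of the same pattern that min-max scales within each group and keeps the key in the output; StatCounter also tracks elementwise min and max for NumPy arrays:
def min_max_scale(rdd, stats):
    def scale_(kv):
        k, v = kv
        rng = stats[k].max() - stats[k].min()  # may contain zeros for constant features
        return (k, (v - stats[k].min()) / rng)
    return rdd.map(scale_)

min_max_scaled = min_max_scale(rdd, compute_stats(rdd))
min_max_scaled.first()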