PySpark's window function fn.avg() only outputs the same data - pyspark

Here is my code:
import pandas as pd
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as fn
from pyspark.sql.functions import isnan, isnull
from pyspark.sql.functions import lit
from pyspark.sql.window import Window

spark = SparkSession.builder.appName(" ").getOrCreate()

# Read the CSV with pandas, then convert it to a Spark DataFrame
file = r"D:\project\HistoryData.csv"
lines = pd.read_csv(file)
spark_df = spark.createDataFrame(lines, ['id', 'time', 'average', 'max', 'min'])

# Moving average over the previous, current and next row within each time partition
temp = Window.partitionBy("time").orderBy("id").rowsBetween(-1, 1)
df = spark_df.withColumn("movingAvg", fn.avg("average").over(temp))
df.show()
But the output is not what I expect: the movingAvg column contains the same values as the average column, and some of the data disappears.
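For comparison, here is a small self-contained test with toy data (not my real HistoryData.csv, so the values are my own): when a time partition has several rows, rowsBetween(-1, 1) averages the neighbouring rows, but when a partition holds only a single row, the "moving average" is simply that row's own value, which looks like the same data being echoed back.

from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("movingAvgDemo").getOrCreate()

# Toy data: three ids share time "t1", so that partition has neighbours to average over
toy = spark.createDataFrame(
    [(1, "t1", 10.0), (2, "t1", 20.0), (3, "t1", 30.0), (1, "t2", 5.0)],
    ['id', 'time', 'average'])

# Previous, current and next row within each time partition
w = Window.partitionBy("time").orderBy("id").rowsBetween(-1, 1)
toy.withColumn("movingAvg", fn.avg("average").over(w)).show()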

Related

Loading data from AWS EMR to Redshift using Glue is very slow

I am trying to load data from AWS EMR (data stored on S3, with the Glue Catalog as the metastore) into Redshift.
import sys
import boto3
from datetime import datetime, date
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.functions import to_date
from pyspark.sql import SQLContext

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.sparkSession
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Extract one day of data from the Glue Catalog (Hive) table
df = sqlContext.sql("Select * from classic_models.orderdetails where insert_date >= '2021-01-01' and insert_date < '2021-01-02'")
dynamic_df = DynamicFrame.fromDF(df, glueContext, "dynamic_df")

redshift_target_table = "classic_models.orderdetails"
pre_actions = f"Truncate table {redshift_target_table};"
redshift_connection_opts = {
    "database": "dev",
    "dbtable": redshift_target_table,
    "aws_iam_role": "arn:aws:iam::*********",
    "preactions": pre_actions
}
s3_temp_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=dynamic_df,
    catalog_connection="redshift",
    connection_options=redshift_connection_opts,
    redshift_tmp_dir="s3://staging/orderdetails/%s/" % s3_temp_dir  # Need change
)
Extracting the data from Hive is quite fast, but loading it into Redshift takes a long time. By long I mean that when I load the past 10 days of data, the Glue job takes 16 minutes to complete: less than 1 minute of that is extracting the data from Hive, and the rest is only loading it into Redshift.
More than half of the Hive table columns are of String data type.
Is there a better and faster way to do this?
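One thing I am considering (an assumption of mine, not something I have verified on this job) is controlling how many files the job writes to the temporary S3 directory before the COPY into Redshift, by coalescing the DataFrame before converting it to a DynamicFrame; the partition count below is only a placeholder:

# Hypothetical tweak: fewer, larger files in redshift_tmp_dir; 16 is a placeholder value
coalesced_df = df.coalesce(16)
dynamic_df = DynamicFrame.fromDF(coalesced_df, glueContext, "dynamic_df")

glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=dynamic_df,
    catalog_connection="redshift",
    connection_options=redshift_connection_opts,
    redshift_tmp_dir="s3://staging/orderdetails/%s/" % s3_temp_dir
)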

sc is not defined while running executable python code

I am running the following code with spark-submit (Spark 2.3.0) and getting "NameError: name 'sc' is not defined".
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *

if __name__ == "__main__":
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    forecast = sc.read.load('/user/gg/LV_hadoop_example.csv',
                            format='csv', header='true', inferSchema='true', sep=',')
    forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)
    forecast.saveAsTextFile("word_count11.txt")
In Spark 2.3.0, the correct way to load a CSV file is with a SparkSession:
from pyspark.sql import SparkSession

# initiate spark instance
spark = SparkSession.builder \
    .master("local") \
    .appName("abc") \
    .getOrCreate()

# read csv file
df = spark.read.csv('/user/gg/LV_hadoop_example.csv')
Check the documentation for more examples.
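Applied to the original script, the rest of the pipeline would look roughly like this (a sketch: the column name and input path come from the question, and writing with df.write.csv replaces the RDD-style saveAsTextFile, which DataFrames do not have):

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local") \
    .appName("abc") \
    .getOrCreate()

# Read with a header and inferred types, as in the original spark-submit script
forecast = spark.read.csv('/user/gg/LV_hadoop_example.csv',
                          header=True, inferSchema=True, sep=',')

# Keep only the positive forecasts and write the result out (Spark writes a directory)
forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)
forecast.write.csv('word_count11.txt')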

Pyspark Window Function

I am trying to calculate the row_number on a dataset based on certain columns, but I am getting the error below:
AttributeError: 'module' object has no attribute 'rowNumber'
I am using the script below to get the row number based on MID and ClaimID. Any thoughts on why this is coming up?
from pyspark.sql.functions import first
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql import Row, functions as F
from pyspark.sql.window import Window
import pyspark.sql.functions as func

def Codes(pharmacyCodes):
    df_data = pharmacyCodes
    (df_data
     .select("MID", "claimid",
             F.rowNumber()
             .over(Window
                   .partitionBy("MID")
                   .orderBy("MID"))
             .alias("rowNum"))
     .show())
I think you're looking for row_number rather than rowNumber. The mixture of camel case and snake case in PySpark can get confusing.
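For completeness, a sketch of the corrected select using the column names from the question; ordering by "claimid" within each MID partition is an assumption on my part, since ordering by the partition key itself would be arbitrary:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# row_number (snake case) is the window function; there is no F.rowNumber in recent versions
w = Window.partitionBy("MID").orderBy("claimid")
df_data.select("MID", "claimid", F.row_number().over(w).alias("rowNum")).show()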

Converting Scala code to PySpark

I have found the following code for selecting n rows from a dataframe grouped by unique_id.
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number
val window = Window.partitionBy("userId").orderBy($"rating".desc)
dataframe.withColumn("r", row_number.over(window)).where($"r" <= n)
I have tried the following:
from pyspark.sql.functions import row_number, desc
from pyspark.sql.window import Window
w = Window.partitionBy(post_tags.EntityID).orderBy(post_tags.Weight)
newdata=post_tags.withColumn("r", row_number.over(w)).where("r" <= 3)
I get the following error:
AttributeError: 'function' object has no attribute 'over'
Please help me with this.
I found the answer to this:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

window = Window.partitionBy(df['user_id']).orderBy(df['score'].desc())
df.select('*', rank().over(window).alias('rank')) \
  .filter(col('rank') <= 2) \
  .show()
Credits to @mtoto for his answer: https://stackoverflow.com/a/38398563/5165377
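As a side note, a closer line-for-line translation of the Scala snippet would call row_number() with parentheses and sort descending, matching the original orderBy($"rating".desc); n = 3 here is just the value from my attempt above:

from pyspark.sql.functions import row_number, desc, col
from pyspark.sql.window import Window

n = 3  # rows to keep per group, as in my attempt above
w = Window.partitionBy(post_tags.EntityID).orderBy(desc("Weight"))
newdata = post_tags.withColumn("r", row_number().over(w)).where(col("r") <= n)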

I ran into some problems using PySpark on Jupyter

The problem occurs when I load the file.
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Mytest") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
# Load training data
data = spark.read.format("libsvm").load("/test/test.txt")
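For context, a sketch of how the imported NaiveBayes and MulticlassClassificationEvaluator would typically be used once the libsvm data loads; the split ratio, seed and metric below are placeholders of mine, not part of the original notebook:

# Assumes `data` loaded successfully; ratios, seed and metric are placeholders
train, test = data.randomSplit([0.8, 0.2], seed=42)

nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(train)

predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test set accuracy:", evaluator.evaluate(predictions))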