PySpark - TypeError: 'Column' object is not callable

I am trying to execute the PySpark code below and am getting the error
TypeError: 'Column' object is not callable.
Can somebody help me understand what is wrong with my code?
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import *
from pyspark.sql.functions import lead, add_months, trunc, max

appName = "Example"
master = "local[*]"

spark = (
    SparkSession
    .builder
    .appName(appName)
    .master(master)
    .enableHiveSupport()
    .getOrCreate()
)

df = spark.read.table("user.test")

generate_months = (
    df
    .select(
        "DP_AD_ACCT_NBR",
        "DP_AD_CCY_CDE",
        "DP_AD_CURR_BAL",
        "DP_AD_LST_MDFSN_DATE",
        F.lead("DP_AD_LST_MDFSN_DATE", 1, F.current_date()).over(
            Window.partitionBy("DP_AD_ACCT_NBR", "DP_AD_CCY_CDE")
            .orderBy("DP_AD_LST_MDFSN_DATE")
        ).alias("next_date")
    )
)

The issue here is with the F.lead() call. The third parameter (the default value) must not be a Column; it has to be a plain constant. If you want to use a Column as the default value, use coalesce() instead:
F.coalesce(
    F.lead("DP_AD_LST_MDFSN_DATE").over(
        Window.partitionBy("DP_AD_ACCT_NBR", "DP_AD_CCY_CDE")
        .orderBy("DP_AD_LST_MDFSN_DATE")
    ),
    F.current_date()
).alias("next_date")
The offset defaults to 1, so you can omit it.
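Putting it together, the full select could look roughly like this (same columns as in the question; an untested sketch):
generate_months = (
    df
    .select(
        "DP_AD_ACCT_NBR",
        "DP_AD_CCY_CDE",
        "DP_AD_CURR_BAL",
        "DP_AD_LST_MDFSN_DATE",
        F.coalesce(
            F.lead("DP_AD_LST_MDFSN_DATE").over(
                Window.partitionBy("DP_AD_ACCT_NBR", "DP_AD_CCY_CDE")
                .orderBy("DP_AD_LST_MDFSN_DATE")
            ),
            F.current_date()
        ).alias("next_date")
    )
)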

Related

Azure Databricks: analyze if the column names are lower case, using the islower() function

This is my logic in PySpark:
df2 = spark.sql(f" SELECT tbl_name, column_name, data_type, current_count FROM {database_name}.{tablename}")
query_df = spark.sql(f"SELECT tbl_name, COUNT(column_name) as `num_cols` FROM {database_name}.{tablename} GROUP BY tbl_name")
df_join = df2.join(query_df,['tbl_name'])
Then I want to add another column to the DataFrame, called 'column_case_lower', that analyzes whether the column names are lower case using the islower() function.
I'm using this logic to do the analysis:
df_join.withColumn("column_case_lower",
when((col("column_name").islower()) == 'true'.otherwise('false'))
# The error is: unexpected EOF while parsing
expecting something like this:
islower() can't be applied to a Column type. Use the code below, which uses a UDF instead.
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import BooleanType

# UDF that applies Python's str.islower() to each value
def checkCase(col_value):
    return col_value.islower()

checkUDF = udf(checkCase, BooleanType())

df.withColumn("new_col", when(checkUDF(col('column_name')), "True")
              .otherwise("False")).show()
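Alternatively, a UDF-free sketch using the built-in lower() function (note this treats names with no letters as lower case, unlike Python's islower()):
from pyspark.sql.functions import col, lower, when

df_join.withColumn(
    "column_case_lower",
    # a name is "lower case" if it equals its lower-cased form
    when(col("column_name") == lower(col("column_name")), "True").otherwise("False")
).show()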

Writing data from Kafka to Hive using PySpark - stuck

I am quite new to Spark and started with PySpark; I am learning to push data from Kafka to Hive using PySpark.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import *
from pyspark.streaming.kafka import KafkaUtils
from os.path import abspath
warehouseLocation = abspath("spark-warehouse")
spark = SparkSession.builder.appName("sparkstreaming").getOrCreate()
df = (
    spark.read.format("kafka")
    .option("startingoffsets", "earliest")
    .option("kafka.bootstrap.servers", "kafka-server1:66,kafka-server2:66")
    .option("kafka.security.protocol", "SSL")
    .option("kafka.ssl.keystore.location", "mykeystore.jks")
    .option("kafka.ssl.keystore.password", "mykeystorepassword")
    .option("subscribe", "json_stream")
    .load()
    .selectExpr("CAST(value AS STRING)")
)
json_schema = df.schema
df1 = df.select($"value").select(from_json,json_schema).alias("data").select("data.*")
The above is not working; after extracting the data, I want to insert it into a Hive table.
As I am completely new, I am looking for help.
Appreciated in advance! :)
from os.path import expanduser, join, abspath
from pyspark.sql import SparkSession
from pyspark.sql import Row
# warehouse_location points to the default location for managed databases and tables
warehouse_location = abspath('spark-warehouse')
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()
# spark is an existing SparkSession
spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")

Create a column with a range of values between values in two other columns in PySpark

I have two columns, A (year1) and B (year2), in Spark. I need to create a column C that contains an array of the years between year1 and year2.
Suppose A = 1990 and B = 1993.
The output C should be [1990, 1990, 1991, 1991, 1992, 1992, 1993, 1993].
Could anyone come up with a solution (in Spark) without using a UDF?
You could try the following; assume df contains year1 and year2.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# a lookup table of candidate years to join against
years = spark.range(2020).withColumnRenamed('id', 'year')

df = (
    df
    .withColumn(
        'id',
        F.monotonically_increasing_id()
    )
    .join(
        years,
        F.col('year').between('year1', 'year2'),
    )
    .groupBy(
        'id'
    )
    .agg(
        F.collect_list('year').alias('years')
    )
)
Let me know if this doesn't work.
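On Spark 2.4+ there is also a built-in sequence function that avoids the join entirely. A minimal sketch, assuming year1 and year2 are integer columns (like the join above, it lists each year once rather than twice):
from pyspark.sql import functions as F

# build an inclusive array of years from year1 to year2
df = df.withColumn('years', F.sequence(F.col('year1'), F.col('year2')))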

streamWrite with append option and window function

I'm trying to use writeStream with the append output mode, but I get an error.
Code:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window
from pyspark.sql.functions import col, column, count, when
spark = SparkSession \
    .builder \
    .appName("get_sensor_data") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

Sensor = lines.select(lines.value.alias('Sensor'),
                      lines.timestamp)

windowedCounts = Sensor.withWatermark('timestamp', '10 seconds') \
    .groupBy(window(Sensor.timestamp, windowDuration, slideDuration)) \
    .agg(count(when(col('Sensor') == "LR1 On", True)).alias('LR1'),
         count(when(col('Sensor') == "LR2 On", True)).alias('LR2'),
         count(when(col('Sensor') == "LD On", True)).alias('LD')) \
    .orderBy('window')

query = windowedCounts \
    .writeStream \
    .outputMode('append') \
    .format("console") \
    .start()
Error:
Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark
The reason for using the append mode is that I want to save the output as a CSV file later.
I think this problem is caused by the window function, but I don't know how to solve it.
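For what it's worth, one thing to check is the orderBy('window') call: sorting is not supported on streaming aggregations outside complete mode, and the watermark must be applied to the same timestamp column that feeds window(). A minimal, untested sketch of the aggregation without the sort, written out to CSV (the output path and checkpoint location are placeholders):
windowedCounts = (
    Sensor
    .withWatermark('timestamp', '10 seconds')
    .groupBy(window(Sensor.timestamp, windowDuration, slideDuration))
    .agg(
        count(when(col('Sensor') == "LR1 On", True)).alias('LR1'),
        count(when(col('Sensor') == "LR2 On", True)).alias('LR2'),
        count(when(col('Sensor') == "LD On", True)).alias('LD'),
    )
)

query = (
    windowedCounts
    .writeStream
    .outputMode('append')
    .format("csv")
    .option("path", "/tmp/sensor_counts")              # placeholder output path
    .option("checkpointLocation", "/tmp/sensor_ckpt")  # placeholder checkpoint dir
    .start()
)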

sc is not defined while running executable python code

I am running the following code with spark-submit (Spark 2.3.0) and getting "NameError: name 'sc' is not defined"
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *

if __name__ == "__main__":
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    forecast = sc.read.load('/user/gg/LV_hadoop_example.csv',
                            format='csv', header='true', inferSchema='true', sep=',')
    forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)
    forecast.saveAsTextFile("word_count11.txt")
In Spark 2.3.0, the correct way to load a CSV file is with a SparkSession:
from pyspark.sql import SparkSession

# initiate spark instance
spark = SparkSession.builder \
    .master("local") \
    .appName("abc") \
    .getOrCreate()

# read csv file
df = spark.read.csv('/user/gg/LV_hadoop_example.csv', header=True, inferSchema=True)
Check the documentation for more examples.
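With the SparkSession-based DataFrame in place, the rest of the original logic would look roughly like this (DataFrames are written with df.write rather than the RDD-style saveAsTextFile):
# keep only rows with a positive forecast, then write the result out as CSV
forecast = df.filter(df['Total_scaled_forecast'] > 0)
forecast.write.csv("word_count11", header=True)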