emr-container pyspark job running indefinitely - pyspark

Here's my Python script:
import calendar
import pydeequ
import boto3
import psycopg2
import os
import pyspark
from py4j import *
from pyspark.sql import SparkSession,Row
from pydeequ.profiles import *
from pydeequ.suggestions import *
from pydeequ.repository import *
from pydeequ.analyzers import *
from pyspark.sql import SparkSession
from botocore.config import Config
from datetime import datetime,timedelta,date
from pyspark.conf import SparkConf
from pydeequ.checks import *
from pydeequ.verification import *
from py4j.java_gateway import java_import
# Run `pyspark --version` in a shell; the printed value is the exit status
# returned by os.system.
print(os.system("""pyspark --version"""))
# Hive-enabled session that pulls in the Deequ jar via the Maven coordinate
# exposed by pydeequ and excludes the f2j artifact.
spark = (
    SparkSession.builder
    .appName('run_dq_for_xpertrak_pathtrak')
    .enableHiveSupport()
    .config(conf=SparkConf())
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)
# Import the org.apache.spark.sql classes into the py4j JVM view.
java_import(spark._sc._jvm, "org.apache.spark.sql.*")
print('here---')
print(spark)
# Ten-row sample of the table; only the DataFrame object itself is printed.
junk = spark.sql("""SELECT * FROM xpertrak.pathtrak LIMIT 10""")
print(junk)
Within AWS emr-containers (i.e. EMR on EKS), this job runs successfully and the UI shows that the job completed. However, when I append the following lines of code to the bottom of the script above, the job technically completes (based on simple log prints), but the UI never leaves the running state...
print('checking')
# Warning-level check whose only constraint is a row count of at least 5,000,000.
check = Check(spark, level=CheckLevel.Warning, description="Data Validation Check")
# Bind the suite to the sampled DataFrame first, then attach the check and run it.
suite = VerificationSuite(spark).onData(junk)
checkResult = suite.addCheck(check.hasSize(lambda x: x >= 5000000)).run()
print(checkResult)
print('check')
This is what that looks like in the AWS console/UI:
What could be causing this anomaly?

Based on AWS-supplied docs from here, adding the following ended the job successfully:
# Shut down the py4j callback server held by the Spark gateway, then stop the
# session; per the answer text, adding these two calls let the job end.
# NOTE(review): presumably the callback server is what keeps the driver
# process alive after the checks run — confirm against the PyDeequ docs.
spark.sparkContext._gateway.shutdown_callback_server()
spark.stop()

Related

Access home directory within spark task node

Directory: /home/hadoop/
module.py
def incr(value):
    """Return ``value + 1`` converted to ``int``."""
    return int(value + 1)
main.py
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
import sys
# Put /home/hadoop/ on the import path so `import module` resolves in this process.
sys.path.append('/home/hadoop/')
import module
if __name__ == '__main__':
    # Two-row demo frame with columns `id` and `value`.
    df = spark.createDataFrame([['a', 1], ['b', 2]], schema=['id', 'value'])
    df.show()
    print(module.incr(5)) #this works
    # this throws module not found error
    incr_udf = F.udf(lambda val: module.incr(val), T.IntegerType())
    df = df.withColumn('new_value', incr_udf('value'))
    df.show()
Spark task nodes do not have access to /home/hadoop/
How do I import module.py from within spark task nodes?
If you are submitting the Spark job to YARN, the tasks are launched by the user 'yarn' on the worker nodes and will not have permission to access /home/hadoop/.
You can add --py-files module.py to your spark-submit command; then you can call the functions in module.py directly by adding from module import *, since the file is then available inside each container.

Fail to savetoMongoDB :java.lang.ClassNotFoundException: com.mongodb.hadoop.io.BSONWritable

I want to convert data from Dataframe to RDD, and save it to MongoDB, here is my code:
import pymongo
import pymongo_spark
from pyspark import SparkConf, SparkContext
from pyspark import BasicProfiler
from pyspark.sql import SparkSession
class MyCustomProfiler(BasicProfiler):
    """BasicProfiler subclass whose ``show`` prints a single marker line."""
    def show(self, id):
        """Print a one-line message naming the RDD identified by ``id``."""
        print("My custom profiles for RDD:%s" % id)
# Profiling is switched on in this SparkConf; the object is not passed to the
# session builder below.
conf = SparkConf().set("spark.python.profile", "true")
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Word Count")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
# Important: activate pymongo_spark.
pymongo_spark.activate()
on_time_dataframe = spark.read.parquet(r'\data\on_time_performance.parquet')
on_time_dataframe.show()
# Note we have to convert the row to a dict to avoid https://jira.mongodb.org/browse/HADOOP-276
as_dict = on_time_dataframe.rdd.map(lambda row: row.asDict())
as_dict.saveToMongoDB('mongodb://localhost:27017/agile_data_science.on_time_performance')
Some errors occur:
py4j.protocol.Py4JJavaError: An error occurred while calling
z:org.apache.spark.api.python.PythonRDD.saveAsNewAPIHadoopFile.
: java.lang.ClassNotFoundException: com.mongodb.hadoop.io.BSONWritable
I have installed the mongo-hadoop file; it seems I don't have the BSONWritable class. I'm not good at Java, so I would like someone to help me.

Load a file to couchbase using spark

I haven't found any clear solution for loading a file into Couchbase using Spark.
I have a huge file with a lot of records similar to this:
ID|Content
prd_lct:11118:3|{"type":"prd_lct","lct_nbr":118,"itm_nbr":3,"locations":[{"lct_s12_id":1,"prd_121_typ_cd":1,"fxt_ail_id":"45","fxt_bay_id":"121","lvl_txt":"2"}],"itemDetails":[{"pmy_vbu_nbr":null,"upc_id":"1212121","vnd_mod_id":"1212121"}]}
My code
spark-shell --packages com.couchbase.client:spark-connector_2.11:2.2.0 --conf spark.couchbase.username=username --conf spark.couchbase.password=passrod --conf spark.couchbase.bucket.bucketname="" --conf spark.couchbase.nodes=http://1.2.3.4:18091,http://1.2.3.3:18091,http://1.2.3.5:18091
import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.spark._
import com.couchbase.spark.streaming._
import org.apache.spark.sql.{DataFrameReader, SQLContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
// Read the pipe-delimited file, treating its first line (ID|Content) as the header.
val df = spark.read.option("delimiter", "|").option("header", true).csv("/hdfsData/test.doc").toDF()
// Expose the DataFrame to Spark SQL under the name "TempQuery".
df.createOrReplaceTempView("TempQuery")
// Attempt to turn every row into a JsonDocument and write the result to Couchbase.
// NOTE(review): JsonArray is not among the imports shown with this snippet, and
// `pair._1` / `pair._2` treat each row as a tuple — presumably the rows are
// Row objects here; confirm. The author states this code does not work as written.
spark.sql("select * from TempQuery").map(pair => { val ID = JsonArray.create()
val content = JsonObject.create().put("ID", ID)
pair._2.map(_.value.getString("Content")).foreach(ID.add)
JsonDocument.create(pair._1, content)
})
.saveToCouchbase()
I know this is wrong, but I have just started and am new to Scala and Couchbase.
Please let me know your inputs. Basically, I have the key and value in a file separated by |, and I want to load them into Couchbase.

cannot pickle pyspark dataframe

I want to create a decision tree model using spark submit.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkConf, SparkContext
from numpy import array
from pyspark.sql import SparkSession
# The same MongoDB collection serves as both the input and the output URI.
mongo_uri = "mongodb://127.0.0.1/newumc.classification_data"
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", mongo_uri)
    .config("spark.mongodb.output.uri", mongo_uri)
    .getOrCreate()
)
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
dt = df.rdd.map(createLabeledPoints)
# Features 0 through 38 are each declared categorical with 2 categories.
categorical_features = {feature: 2 for feature in range(39)}
model_dt = DecisionTree.trainClassifier(
    dt,
    numClasses=467,
    categoricalFeaturesInfo=categorical_features,
    impurity='gini',
    maxDepth=30,
    maxBins=32,
)
where createLabeledPoints is a function that returns a LabeledPoint.
I have no issue when I execute this code using pyspark in the spark-shell,
but I want to use spark-submit; when I do that, it gives me this error:
pickle.PicklingError: Could not serialize object: TypeError: can't pickle thread.lock objects
I think the problem is either that I create another SparkSession inside spark-submit, or that a PySpark DataFrame cannot be pickled.
Can anyone please help me?

Spark.sql and sqlContext.sql

I have imported the below modules. I tried to load data from sqlCtx.read.format, I am getting "IllegalArgumentException: u"Error while instantiating 'org.apache.spark.sql.hive.HiveSessionState':"" error, but it works well when I use spark.read.format. I am seeing same behavior when I am retrieving data from registered temptable/view. What can I add extra to use sqlCtx.sql instead of spark.sql?
import os
import sys
import pandas as pd
import odbc as pyodbc
import os
import sys
import re
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import *
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pyspark.sql.functions as func
import matplotlib.patches as mpatches
import time as time
from matplotlib.patches import Rectangle
import datetime
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
# Configure a local master and the application name in one fluent chain.
conf = SparkConf().setMaster("local").setAppName("AppName")
# `sc` is not created anywhere in this snippet; it must already exist in the session.
sqlCtx = SQLContext(sc)
I spent two hours of my life on this one, just to realize I did not need:
sqlCtx = SQLContext(sc)
Just using SQLContext.read.(...) solved this in my case.