Access home directory within spark task node - pyspark

Directory: /home/hadoop/
module.py
def incr(value):
    return int(value + 1)
main.py
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
import sys
sys.path.append('/home/hadoop/')
import module
if __name__ == '__main__':
    df = spark.createDataFrame([['a', 1], ['b', 2]], schema=['id', 'value'])
    df.show()
    print(module.incr(5))  # this works
    # this throws a "module not found" error
    incr_udf = F.udf(lambda val: module.incr(val), T.IntegerType())
    df = df.withColumn('new_value', incr_udf('value'))
    df.show()
The Spark task nodes do not have access to /home/hadoop/.
How do I import module.py from within the Spark task nodes?

If you are submitting the Spark job to YARN, the tasks on the worker nodes are launched by the user 'yarn', which does not have permission to access /home/hadoop/.
You can add --py-files module.py to your spark-submit command; then you can call the functions in module.py directly (for example with from module import *), since the file is shipped into every container.
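For example, a minimal sketch of the submit command (file names taken from the question):
spark-submit --py-files /home/hadoop/module.py main.py
Because --py-files places module.py on the PYTHONPATH of the driver and every executor, the sys.path.append('/home/hadoop/') workaround is no longer needed and the UDF can resolve module.incr on the task nodes.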

Related

emr-container pyspark job running indefinitely

Here's my Python script:
import calendar
import pydeequ
import boto3
import psycopg2
import os
import pyspark
from py4j import *
from pyspark.sql import SparkSession,Row
from pydeequ.profiles import *
from pydeequ.suggestions import *
from pydeequ.repository import *
from pydeequ.analyzers import *
from pyspark.sql import SparkSession
from botocore.config import Config
from datetime import datetime,timedelta,date
from pyspark.conf import SparkConf
from pydeequ.checks import *
from pydeequ.verification import *
from py4j.java_gateway import java_import
print(os.system("""pyspark --version"""))
spark = (SparkSession.builder
    .appName('run_dq_for_xpertrak_pathtrak')
    .enableHiveSupport()
    .config(conf=SparkConf())
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate())
java_import(spark._sc._jvm, "org.apache.spark.sql.*")
print('here---')
print(spark)
junk = spark.sql("""SELECT * FROM xpertrak.pathtrak LIMIT 10""")
print(junk)
Within AWS emr-containers (i.e. EMR on EKS), this job runs successfully and the UI shows that the job completed. However, when I append the following lines of code to the bottom of the script above, the job technically completes (based on simple log prints), but the UI never changes from the running state...
print('checking')
check = Check(spark, level=CheckLevel.Warning, description="Data Validation Check")
checkResult = VerificationSuite(spark) \
    .onData(junk) \
    .addCheck(
        check.hasSize(lambda x: x >= 5000000)
    ).run()
print(checkResult)
print('check')
This is what it looks like in the AWS console/UI:
What could be causing this anomaly?
Based on AWS-supplied docs from here, adding the following ended the job successfully:
spark.sparkContext._gateway.shutdown_callback_server()
spark.stop()
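In other words, those two lines go at the very end of the script, after the verification has run. A minimal sketch of the tail of the script, reusing the names from the question:
checkResult = VerificationSuite(spark) \
    .onData(junk) \
    .addCheck(check.hasSize(lambda x: x >= 5000000)) \
    .run()
print(checkResult)
# Shut down the Py4J callback server that PyDeequ uses for the lambda assertion,
# then stop Spark so the driver can exit and the job leaves the "running" state.
spark.sparkContext._gateway.shutdown_callback_server()
spark.stop()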

dataframe.py", in __getattr__ AttributeError: 'DataFrame' object has no attribute 'index'

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
from pandas import DataFrame
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName('Optimize BigQuery Storage') \
    .config('spark.jars.packages', 'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.24.2') \
    .getOrCreate()
df = spark.read.text('gs://test111/test/test.txt', lineSep=",")
# df.selectExpr("split(value, ',') as Text_Data_In_Rows_Using_Text").show(4, False)
df.show()
sProjectID = 'prj-d-xxxx-ingest'
sTargetDataset = 'data'
sTargetTable = 'xxx'
client = bigquery.Client()
table_id = 'data.xxx'
# Since string columns use the "object" dtype, pass in a (partial) schema
# to ensure the correct BigQuery data type.
job_config = bigquery.LoadJobConfig(schema=[
    bigquery.SchemaField("Source_Code", "STRING"),
    bigquery.SchemaField("FLAG", "STRING")
])
job = client.load_table_from_dataframe(
    df, table_id, job_config=job_config
)
The error is as below:
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 1643, in __getattr__
AttributeError: 'DataFrame' object has no attribute 'index'
Instead of using google.cloud.bigquery, you can use the Spark BigQuery connector to write the data to BQ. See the Spark BigQuery connector documentation.
Assuming that your data is already correct in your df variable, you can apply this code to write the data to BQ:
gcs_bucket="your-gcs-bucket" #If not provided, it will create a temporary bucket for this
df.write.format("bigquery").option("table","dataset.your_table_here").option("temporaryGcsBucket", gcs_bucket).mode("append").save()
#.mode() can be either 'append' or 'overwrite'
I executed the code using spark-submit with Spark BigQuery jar:
spark-submit --jars gs://spark-lib/bigquery/spark-bigquery-latest.jar my_script.py
See logs after running the script:
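Separately, if you want to double-check the write from PySpark itself, a minimal sketch (reusing the dataset and table name assumed above) could read the table back through the same connector:
df_check = spark.read.format("bigquery") \
    .option("table", "dataset.your_table_here") \
    .load()
df_check.show()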

Load a file to couchbase using spark

I haven't found any clear solution for loading a file into Couchbase using Spark.
I have a huge file with a lot of records similar to this:
ID|Content
prd_lct:11118:3|{"type":"prd_lct","lct_nbr":118,"itm_nbr":3,"locations":[{"lct_s12_id":1,"prd_121_typ_cd":1,"fxt_ail_id":"45","fxt_bay_id":"121","lvl_txt":"2"}],"itemDetails":[{"pmy_vbu_nbr":null,"upc_id":"1212121","vnd_mod_id":"1212121"}]}
My code
spark-shell --packages com.couchbase.client:spark-connector_2.11:2.2.0 --conf spark.couchbase.username=username --conf spark.couchbase.password=password --conf spark.couchbase.bucket.bucketname="" --conf spark.couchbase.nodes=http://1.2.3.4:18091,http://1.2.3.3:18091,http://1.2.3.5:18091
import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.spark._
import com.couchbase.spark.streaming._
import org.apache.spark.sql.{DataFrameReader, SQLContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
val df = spark.read.option("delimiter", "|").option("header", true).csv("/hdfsData/test.doc").toDF()
df.createOrReplaceTempView("TempQuery")
spark.sql("select * from TempQuery").map(pair => { val ID = JsonArray.create()
val content = JsonObject.create().put("ID", ID)
pair._2.map(_.value.getString("Content")).foreach(ID.add)
JsonDocument.create(pair._1, content)
})
.saveToCouchbase()
I know this is wrong, but I just started and am new to Scala and Couchbase.
Please let me know your inputs. Basically, I have the key and the value in a file separated by |, and I want to load them into Couchbase.

SparkSession and SparkContext initiation in PySpark

I would like to know the PySpark equivalent of the following Scala code. I am using Databricks. I need the same output as below:
to create a new Spark session and output the session id (SparkSession#123d0e8)
val new_spark = spark.newSession()
**Output**
new_spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession#123d0e8
to view SparkContext and output the SparkContext id (SparkContext#2dsdas33)
new_spark.sparkContext
**Output**
org.apache.spark.SparkContext = org.apache.spark.SparkContext#2dsdas33
It's very similar. If you already have a session and want to open another one, you can use:
my_session = spark.newSession()
print(my_session)
This will produce the new session object that I think you are trying to create:
<pyspark.sql.session.SparkSession object at 0x7fc3bae3f550>
spark is a session object that is already running, because you are using a Databricks notebook.
A SparkSession can also be created as shown in http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html:
>>> from pyspark.sql import SparkSession
>>> from pyspark.conf import SparkConf
>>> spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()
or
>>> from pyspark.sql import SparkSession
>>> spark = SparkSession.builder.appName('FirstSparkApp').getOrCreate()
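For the second part of the question (viewing the SparkContext), a minimal sketch; note that sessions created with newSession() share the same underlying SparkContext:
>>> new_spark = spark.newSession()
>>> new_spark.sparkContext
<SparkContext master=... appName=...>
The printed repr (the master and appName depend on your environment) is the PySpark analogue of the Scala SparkContext#2dsdas33 output.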

Submitting a pyspark job to Amazon EMR cluster from terminal

I have SSH-ed into the Amazon EMR server, and I want to submit a Spark job (a simple word count; both word_count.py and sample.txt are on the Amazon EMR server) written in Python from the terminal. How do I do this, and what's the syntax?
The word_count.py is as follows:
from pyspark import SparkConf, SparkContext
from operator import add
import sys
## Constants
APP_NAME = " HelloWorld of Big Data"
##OTHER FUNCTIONS/CLASSES
def main(sc, filename):
    textRDD = sc.textFile(filename)
    words = textRDD.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1))
    wordcount = words.reduceByKey(add).collect()
    for wc in wordcount:
        print(wc[0], wc[1])

if __name__ == "__main__":
    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3.awsAccessKeyId", "XXXX")
    sc._jsc.hadoopConfiguration().set("fs.s3.awsSecretAccessKey", "YYYY")
    filename = "s3a://bucket_name/sample.txt"
    # filename = sys.argv[1]
    # Execute main functionality
    main(sc, filename)
You can run this command:
spark-submit s3://your_bucket/your_program.py
If you need to run the script using Python 3, you can run this command before spark-submit:
export PYSPARK_PYTHON=python3.6
Remember to save your program in an S3 bucket before running spark-submit.
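Putting it together, a small end-to-end sketch (the bucket and file names are taken from the post; adjust them to your own):
export PYSPARK_PYTHON=python3.6
spark-submit s3://your_bucket/word_count.py
If you switch to the commented-out filename = sys.argv[1] line in the script, you could also pass the input path as an argument, e.g. spark-submit s3://your_bucket/word_count.py s3a://bucket_name/sample.txt.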