I'm submitting steps to run in parallel on an EMR cluster. I do this by saving Python scripts containing PySpark code to an S3 path and then using the boto3 code below to submit the steps in parallel to the EMR cluster; each step runs the code saved in one of those script files.
I would like to instead create a function, like the example "read_write" function below, that performs the same operations as the saved Python scripts, so that I don't have to save a script file to S3 for every step I want to run in parallel. Is there a way to submit the code I want a step to run to the EMR cluster with the boto3 code below, without saving it as a script file in S3?
Below is example code similar to what I'm currently using to submit steps to run in parallel, followed by code similar to what I would like to try: creating the "read_write" function in the same file as the boto3 script and submitting it as a step to run in parallel on the EMR cluster.
# test1.py
from pyspark.sql.types import *
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark
# from cloudwatch_logs import (log_handler, error_manager)
import boto3
#############
#############
conf = pyspark.SparkConf()
spark = SparkSession.builder \
    .appName("test1") \
    .config('spark.sql.codegen.wholeStage', False) \
    .getOrCreate()
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
df=sqlContext.read.parquet("path1")
df.write.mode("overwrite").parquet("write_path1")
# test2.py
from pyspark.sql.types import *
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark
# from cloudwatch_logs import (log_handler, error_manager)
import boto3
#############
#############
conf = pyspark.SparkConf()
spark = SparkSession.builder \
    .appName("test2") \
    .config('spark.sql.codegen.wholeStage', False) \
    .getOrCreate()
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
df=sqlContext.read.parquet("path2")
df.write.mode("overwrite").parquet("write_path2")
# script to submit multiple steps to run in parallel on emr
client = boto3.client('emr', region_name='us-west-2')
response = client.add_job_flow_steps(
    JobFlowId='j-xxxxxxxx',  # cluster id
    Steps=[
        {
            'Name': 'test1',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    "spark-submit",
                    "--deploy-mode",
                    "cluster",
                    "--master",
                    "yarn",
                    "--conf",
                    "spark.yarn.submit.waitAppCompletion=true",
                    "s3a://path/test1.py"
                ]
            }
        },
        {
            'Name': 'test2',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    "spark-submit",
                    "--deploy-mode",
                    "cluster",
                    "--master",
                    "yarn",
                    "--conf",
                    "spark.yarn.submit.waitAppCompletion=true",
                    "s3a://path/test2.py"
                ]
            }
        }
    ]
)
# what I would like to do
from pyspark.sql.types import *
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark
# from cloudwatch_logs import (log_handler, error_manager)
import boto3
# function to read and write dataframe
def read_write(read_path, write_path):
    df = sqlContext.read.parquet(read_path)
    df.write.mode("overwrite").parquet(write_path)
client = boto3.client('emr', region_name='us-west-2')
response = client.add_job_flow_steps(
    JobFlowId='j-xxxxxxxx',  # cluster id
    Steps=[
        {
            'Name': 'test1',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    "spark-submit",
                    "--deploy-mode",
                    "cluster",
                    "--master",
                    "yarn",
                    "--conf",
                    "spark.yarn.submit.waitAppCompletion=true",
                    read_write(read_path='path1', write_path='write_path1')
                ]
            }
        },
        {
            'Name': 'test2',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    "spark-submit",
                    "--deploy-mode",
                    "cluster",
                    "--master",
                    "yarn",
                    "--conf",
                    "spark.yarn.submit.waitAppCompletion=true",
                    read_write(read_path='path2', write_path='write_path2')
                ]
            }
        }
    ]
)
Why not just turn on concurrent steps?
StepConcurrencyLevel (integer) -- Specifies the number of steps that can be executed concurrently. The default value is 1. The maximum value is 256.
Check the documentation for more details.
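For example, a minimal sketch of raising the concurrency on an existing cluster with boto3 (the cluster ID and level here are placeholders); once the level is above 1, steps added with add_job_flow_steps can run at the same time on EMR releases that support step concurrency (5.28.0 and later):
import boto3
client = boto3.client('emr', region_name='us-west-2')
# allow up to 10 steps to run concurrently on the cluster
client.modify_cluster(
    ClusterId='j-xxxxxxxx',  # same cluster id used above
    StepConcurrencyLevel=10
)
You can also pass StepConcurrencyLevel directly to run_job_flow when creating the cluster.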
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
from pandas import DataFrame
spark = SparkSession.builder \
    .appName('Optimize BigQuery Storage') \
    .config('spark.jars.packages', 'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.24.2') \
    .getOrCreate()
df = spark.read.text('gs://test111/test/test.txt', lineSep=",")
#df.selectExpr("split(value, ',') as\
#Text_Data_In_Rows_Using_Text").show(4,False)
df.show()
sProjectID = 'prj-d-xxxx-ingest'
sTargetDataset = 'data'
sTargetTable = 'xxx'
client = bigquery.Client()
table_id = 'data.xxx'
# Since string columns use the "object" dtype, pass in a (partial) schema
# to ensure the correct BigQuery data type.
job_config = bigquery.LoadJobConfig(schema=[
    bigquery.SchemaField("Source_Code", "STRING"),
    bigquery.SchemaField("FLAG", "STRING")
])
job = client.load_table_from_dataframe(
    df, table_id, job_config=job_config
)
The error is below:
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 1643, in __getattr__
AttributeError: 'DataFrame' object has no attribute 'index'
Instead of using google.cloud.bigquery, you can use the BigQuery connector with Spark to write data to BigQuery; see the Spark BigQuery connector documentation. (The error above occurs because client.load_table_from_dataframe expects a pandas DataFrame, while df here is a Spark DataFrame.)
Assuming that your data is already correct in your df variable, you can apply this code to write the data to BigQuery:
gcs_bucket="your-gcs-bucket" #If not provided, it will create a temporary bucket for this
df.write.format("bigquery").option("table","dataset.your_table_here").option("temporaryGcsBucket", gcs_bucket).mode("append").save()
#.mode() can be either 'append' or 'overwrite'
I executed the code using spark-submit with the Spark BigQuery jar:
spark-submit --jars gs://spark-lib/bigquery/spark-bigquery-latest.jar my_script.py
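Alternatively, if you prefer to stay with google.cloud.bigquery, a minimal sketch of what would address the original AttributeError is to convert the Spark DataFrame to pandas first, since load_table_from_dataframe only accepts pandas DataFrames (this assumes the data fits in driver memory):
# convert the Spark DataFrame to pandas before handing it to the BigQuery client
pandas_df = df.toPandas()
job = client.load_table_from_dataframe(pandas_df, table_id, job_config=job_config)
job.result()  # wait for the load job to finish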
Directory: /home/hadoop/
module.py
def incr(value):
    return int(value + 1)
main.py
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
import sys
sys.path.append('/home/hadoop/')
import module
if __name__ == '__main__':
    df = spark.createDataFrame([['a', 1], ['b', 2]], schema=['id', 'value'])
    df.show()
    print(module.incr(5))  # this works
    # this throws module not found error
    incr_udf = F.udf(lambda val: module.incr(val), T.IntegerType())
    df = df.withColumn('new_value', incr_udf('value'))
    df.show()
Spark task nodes do not have access to /home/hadoop/
How do I import module.py from within spark task nodes?
If you are submitting the Spark job to YARN, the tasks are launched by the 'yarn' user on the worker nodes, which does not have permission to access /home/hadoop/.
You can add --py-files module.py to your spark-submit command; the file is then shipped to the containers, so you can call the functions in module.py directly (for example with import module or from module import *).
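A minimal sketch of what that command could look like for the files above (the paths are the ones from the question):
# ship module.py to the driver and executors along with the job
spark-submit --py-files /home/hadoop/module.py /home/hadoop/main.py
With module.py distributed this way, the existing import module in main.py resolves on the executors as well, so the sys.path.append('/home/hadoop/') line should no longer be needed for the UDF to work.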
I want to convert data from a DataFrame to an RDD and save it to MongoDB. Here is my code:
import pymongo
import pymongo_spark
from pyspark import SparkConf, SparkContext
from pyspark import BasicProfiler
from pyspark.sql import SparkSession
class MyCustomProfiler(BasicProfiler):
    def show(self, id):
        print("My custom profiles for RDD:%s" % id)
conf = SparkConf().set("spark.python.profile", "true")
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Word Count") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
# Important: activate pymongo_spark.
pymongo_spark.activate()
on_time_dataframe = spark.read.parquet(r'\data\on_time_performance.parquet')
on_time_dataframe.show()
# Note we have to convert the row to a dict to avoid https://jira.mongodb.org/browse/HADOOP-276
as_dict = on_time_dataframe.rdd.map(lambda row: row.asDict())
as_dict.saveToMongoDB('mongodb://localhost:27017/agile_data_science.on_time_performance')
The following error occurs:
py4j.protocol.Py4JJavaError: An error occurred while calling
z:org.apache.spark.api.python.PythonRDD.saveAsNewAPIHadoopFile.
: java.lang.ClassNotFoundException: com.mongodb.hadoop.io.BSONWritable
I have installed the mongo-hadoop package; it seems I don't have a BSONWritable class. I'm not good at Java, so I would appreciate some help.
I haven't found any clear solution for loading a file into Couchbase using Spark.
I have a huge file with a lot of records similar to this:
ID|Content
prd_lct:11118:3|{"type":"prd_lct","lct_nbr":118,"itm_nbr":3,"locations":[{"lct_s12_id":1,"prd_121_typ_cd":1,"fxt_ail_id":"45","fxt_bay_id":"121","lvl_txt":"2"}],"itemDetails":[{"pmy_vbu_nbr":null,"upc_id":"1212121","vnd_mod_id":"1212121"}]}
My code
spark-shell --packages com.couchbase.client:spark-connector_2.11:2.2.0 --conf spark.couchbase.username=username --conf spark.couchbase.password=passrod --conf spark.couchbase.bucket.bucketname="" --conf spark.couchbase.nodes=http://1.2.3.4:18091,http://1.2.3.3:18091,http://1.2.3.5:18091
import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.spark._
import com.couchbase.spark.streaming._
import org.apache.spark.sql.{DataFrameReader, SQLContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
val df = spark.read.option("delimiter", "|").option("header", true).csv("/hdfsData/test.doc").toDF()
df.createOrReplaceTempView("TempQuery")
spark.sql("select * from TempQuery").map(pair => { val ID = JsonArray.create()
val content = JsonObject.create().put("ID", ID)
pair._2.map(_.value.getString("Content")).foreach(ID.add)
JsonDocument.create(pair._1, content)
})
.saveToCouchbase()
I know this is wrong, but I've just started and am new to Scala and Couchbase.
Please let me know your inputs; basically I have the key and the value in a file separated by | and I want to load them into Couchbase.
I am trying to write a PySpark DataFrame to Redshift, but it results in the following error:
java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.avro.AvroFileFormat could not be instantiated
Caused by: java.lang.NoSuchMethodError: org.apache.spark.sql.execution.datasources.FileFormat.$init$(Lorg/apache/spark/sql/execution/datasources/FileFormat;)V
Spark Version: 2.4.1
Spark-submit command: spark-submit --master local[*] --jars ~/Downloads/spark-avro_2.12-2.4.0.jar,~/Downloads/aws-java-sdk-1.7.4.jar,~/Downloads/RedshiftJDBC42-no-awssdk-1.2.20.1043.jar,~/Downloads/hadoop-aws-2.7.3.jar,~/Downloads/hadoop-common-2.7.3.jar --packages com.databricks:spark-redshift_2.11:2.0.1,com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.3,org.apache.hadoop:hadoop-common:2.7.3,org.apache.spark:spark-avro_2.12:2.4.0 script.py
from pyspark.sql import DataFrameReader
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import *
import sys
import os
pe_dl_dbname = os.environ.get("REDSHIFT_DL_DBNAME")
pe_dl_host = os.environ.get("REDSHIFT_DL_HOST")
pe_dl_port = os.environ.get("REDSHIFT_DL_PORT")
pe_dl_user = os.environ.get("REDSHIFT_DL_USER")
pe_dl_password = os.environ.get("REDSHIFT_DL_PASSWORD")
s3_bucket_path = "s3-bucket-name/sub-folder/sub-sub-folder"
tempdir = "s3a://{}".format(s3_bucket_path)
driver = "com.databricks.spark.redshift"
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
sc._jsc.hadoopConfiguration().set("fs.s3.impl","org.apache.hadoop.fs.s3native.NativeS3FileSystem")
datalake_jdbc_url = 'jdbc:redshift://{}:{}/{}?user={}&password={}'.format(pe_dl_host, pe_dl_port, pe_dl_dbname, pe_dl_user, pe_dl_password)
"""
The table is created in Redshift as follows:
create table adhoc_analytics.testing (name varchar(255), age integer);
"""
l = [('Alice', 1)]
df = spark.createDataFrame(l, ['name', 'age'])
df.show()
df.write \
    .format("com.databricks.spark.redshift") \
    .option("url", datalake_jdbc_url) \
    .option("dbtable", "adhoc_analytics.testing") \
    .option("tempdir", tempdir) \
    .option("tempformat", "CSV") \
    .save()
Databricks spark-redshift doesn't work with Spark version 2.4.1.
Here is the fork that I maintain to make it work with Spark 2.4.1:
https://github.com/goibibo/spark-redshift
How to use it:
pyspark --packages "com.github.goibibo:spark-redshift:v4.1.0" --repositories "https://jitpack.io"
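The same package coordinates should also work with spark-submit for the script in the question (the extra JDBC and Hadoop jars from the original command would still be passed with --jars as before):
spark-submit --packages "com.github.goibibo:spark-redshift:v4.1.0" --repositories "https://jitpack.io" script.py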