from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
from pandas import DataFrame
spark = SparkSession.builder \
    .appName('Optimize BigQuery Storage') \
    .config('spark.jars.packages', 'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.24.2') \
    .getOrCreate()
df = spark.read.text('gs://test111/test/test.txt', lineSep=",")
# df.selectExpr("split(value, ',') as Text_Data_In_Rows_Using_Text").show(4, False)
df.show()
sProjectID = 'prj-d-xxxx-ingest'
sTargetDataset = 'data'
sTargetTable = 'xxx'
client = bigquery.Client()
table_id = 'data.xxx'
# Since string columns use the "object" dtype, pass in a (partial) schema
# to ensure the correct BigQuery data type.
job_config = bigquery.LoadJobConfig(schema=[
    bigquery.SchemaField("Source_Code", "STRING"),
    bigquery.SchemaField("FLAG", "STRING")
])
job = client.load_table_from_dataframe(
    df, table_id, job_config=job_config
)
I get the error below:
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 1643, in getattr
AttributeError: 'DataFrame' object has no attribute 'index'
The error occurs because client.load_table_from_dataframe expects a pandas DataFrame, while df here is a Spark DataFrame. Instead of using google.cloud.bigquery, you can use the Spark BigQuery connector to write the data to BQ. See the Spark BigQuery connector documentation.
Assuming that your data is already correct in your df variable, you can apply this code to write the data to BQ:
gcs_bucket="your-gcs-bucket" #If not provided, it will create a temporary bucket for this
df.write.format("bigquery") \
    .option("table", "dataset.your_table_here") \
    .option("temporaryGcsBucket", gcs_bucket) \
    .mode("append") \
    .save()
# .mode() can be either 'append' or 'overwrite'
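If you want to keep using the google.cloud.bigquery client instead, the Spark DataFrame has to be converted to pandas first, since load_table_from_dataframe only accepts a pandas DataFrame. A minimal sketch, assuming the data fits in driver memory:
pdf = df.toPandas()  # collect the Spark DataFrame to the driver as a pandas DataFrame
job = client.load_table_from_dataframe(pdf, table_id, job_config=job_config)
job.result()  # wait for the load job to complete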
I executed the code using spark-submit with the Spark BigQuery jar:
spark-submit --jars gs://spark-lib/bigquery/spark-bigquery-latest.jar my_script.py
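Equivalently, you can let spark-submit resolve the connector from Maven with --packages (same coordinate as in the session config above) instead of pointing at the jar on GCS:
spark-submit --packages com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.24.2 my_script.py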
I am new to AWS Glue and I am trying to run a transformation process using PySpark. I successfully ran my ETL, but I am looking for another way of converting a DataFrame to a DynamicFrame.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
# load data from crawler
students = glueContext.create_dynamic_frame.from_catalog(database="example_db", table_name="samp_csv")
# move data into a new variable for transformation
students_trans = students
# convert dynamicframe(students_trans) to dataframe
students_ = students_trans.toDF()
# run transformation: change column names / drop columns
students_1 = students_.withColumnRenamed("state", "County") \
                      .withColumnRenamed("capital", "cap") \
                      .drop("municipal", "metropolitan")
#students_1.printSchema()
#convert df back to dynamicframe
from awsglue.dynamicframe import DynamicFrame
students_trans = students_trans.fromDF(students_1, glueContext, "students_trans")
#load into s3 bucket
glueContext.write_dynamic_frame.from_options(frame=students_trans,
                                             connection_type="s3",
                                             connection_options={"path": "s3://kingb/target/"},
                                             format="csv")
You can convert the DataFrame back to a DynamicFrame directly with the DynamicFrame.fromDF class method (here glueContext is the GlueContext created at the top of the script):
from awsglue.dynamicframe import DynamicFrame
students_trans = DynamicFrame.fromDF(students_1, glueContext, "df")
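If the goal is just renaming and dropping columns, you can also stay entirely in DynamicFrame land with the ApplyMapping transform, so no DataFrame round trip is needed. A sketch, assuming the source columns are strings (adjust the types to your actual schema); columns not listed in the mappings (municipal, metropolitan) are dropped:
from awsglue.transforms import ApplyMapping
# keep and rename only the mapped columns
students_trans = ApplyMapping.apply(frame=students,
                                    mappings=[("state", "string", "County", "string"),
                                              ("capital", "string", "cap", "string")])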
Directory: /home/hadoop/
module.py
def incr(value):
    return int(value + 1)
main.py
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
import sys
sys.path.append('/home/hadoop/')
import module
if __name__ == '__main__':
    df = spark.createDataFrame([['a', 1], ['b', 2]], schema=['id', 'value'])
    df.show()
    print(module.incr(5))  # this works
    # this throws a module not found error
    incr_udf = F.udf(lambda val: module.incr(val), T.IntegerType())
    df = df.withColumn('new_value', incr_udf('value'))
    df.show()
Spark task nodes do not have access to /home/hadoop/
How do I import module.py from within spark task nodes?
If you are submitting the Spark job to YARN, the tasks are launched by the 'yarn' user on the worker nodes and will not have permission to access /home/hadoop/.
You can add --py-files module.py to your spark-submit command; then you can call the function from module.py directly (e.g. by adding from module import *), since the file is shipped to all the containers.
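For example, a sketch of the submit command using the path from the question:
spark-submit --py-files /home/hadoop/module.py main.py
Alternatively, the file can be shipped from inside the driver code before the UDF is defined, using SparkContext.addPyFile:
spark.sparkContext.addPyFile('/home/hadoop/module.py')
import module  # now importable inside the UDF on the executors as well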
I want to convert data from a DataFrame to an RDD and save it to MongoDB. Here is my code:
import pymongo
import pymongo_spark
from pyspark import SparkConf, SparkContext
from pyspark import BasicProfiler
from pyspark.sql import SparkSession
class MyCustomProfiler(BasicProfiler):
    def show(self, id):
        print("My custom profiles for RDD:%s" % id)
conf = SparkConf().set("spark.python.profile", "true")
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Word Count") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
# Important: activate pymongo_spark.
pymongo_spark.activate()
on_time_dataframe = spark.read.parquet(r'\data\on_time_performance.parquet')
on_time_dataframe.show()
# Note we have to convert the row to a dict to avoid https://jira.mongodb.org/browse/HADOOP-276
as_dict = on_time_dataframe.rdd.map(lambda row: row.asDict())
as_dict.saveToMongoDB('mongodb://localhost:27017/agile_data_science.on_time_performance')
Some errors occur:
py4j.protocol.Py4JJavaError: An error occurred while calling
z:org.apache.spark.api.python.PythonRDD.saveAsNewAPIHadoopFile.
: java.lang.ClassNotFoundException: com.mongodb.hadoop.io.BSONWritable
I have installed the mongo-hadoop package; it seems I don't have the BSONWritable class. I'm not good at Java, so I would like someone to help me.
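The ClassNotFoundException means the mongo-hadoop jar (which provides com.mongodb.hadoop.io.BSONWritable) is not on Spark's classpath. For reference, a commonly used alternative is the official MongoDB Spark connector, which writes the DataFrame directly without converting it to an RDD. A sketch, assuming the connector is supplied to the job (e.g. with --packages org.mongodb.spark:mongo-spark-connector_2.12:3.0.1):
# write the DataFrame straight to MongoDB via the Spark connector
on_time_dataframe.write \
    .format("com.mongodb.spark.sql.DefaultSource") \
    .option("uri", "mongodb://localhost:27017/agile_data_science.on_time_performance") \
    .mode("append") \
    .save()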
I would like to know the PySpark equivalent of the following Scala code. I am using Databricks. I need the same output as below:
To create a new Spark session and output the session ID (SparkSession#123d0e8):
val new_spark = spark.newSession()
Output:
new_spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession#123d0e8
To view the SparkContext and output the SparkContext ID (SparkContext#2dsdas33):
new_spark.sparkContext
Output:
org.apache.spark.SparkContext = org.apache.spark.SparkContext#2dsdas33
It's very similar. If you already have a session and want to open another one, you can use
my_session = spark.newSession()
print(my_session)
This will print the new session object, which I think is what you are trying to create:
<pyspark.sql.session.SparkSession object at 0x7fc3bae3f550>
spark is a session object that is already running, because you are using a Databricks notebook.
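The SparkContext part works the same way as in Scala; the new session shares the existing context. A short sketch (the printed addresses will of course differ):
new_spark = spark.newSession()
print(new_spark)                  # <pyspark.sql.session.SparkSession object at 0x...>
print(new_spark.sparkContext)     # <SparkContext master=... appName=...>
# the new session shares the SparkContext but has its own temp views and conf
print(new_spark.sparkContext is spark.sparkContext)   # True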
A SparkSession can also be created as described in the docs (http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html):
>>> from pyspark.sql import SparkSession
>>> from pyspark.conf import SparkConf
>>> SparkSession.builder.config(conf=SparkConf())
or
>>> from pyspark.sql import SparkSession
>>> spark = SparkSession.builder.appName('FirstSparkApp').getOrCreate()
I want to create a decision tree model using spark-submit.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkConf, SparkContext
from numpy import array
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/newumc.classification_data") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/newumc.classification_data") \
    .getOrCreate()
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
dt = df.rdd.map(createLabeledPoints)
# features 0-38 are all two-valued categorical features
model_dt = DecisionTree.trainClassifier(dt, numClasses=467,
                                        categoricalFeaturesInfo={i: 2 for i in range(39)},
                                        impurity='gini', maxDepth=30, maxBins=32)
where createLabeledPoints is a function that returns a LabeledPoint.
I have no issue when I execute this code in the pyspark shell,
but when I use spark-submit, it gives me this error:
pickle.PicklingError: Could not serialize object: TypeError: can't pickle thread.lock objects
I think the problem is that I create another SparkSession inside spark-submit, or that the PySpark DataFrame cannot be pickled.
Can anyone please help me!
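For reference, the usual cause of this pickling error is that the mapped function (createLabeledPoints here, which is not shown) captures the SparkSession, the SparkContext, or the DataFrame itself in its closure; those objects hold thread locks and cannot be pickled when the task is serialized. A minimal sketch of a picklable version that only touches the row it receives (the column layout is a hypothetical assumption):
from pyspark.mllib.regression import LabeledPoint

def createLabeledPoints(row):
    # hypothetical layout: first field is the label, the rest are the categorical features
    # the function only uses the row it receives, so it pickles cleanly under spark-submit
    return LabeledPoint(float(row[0]), [float(v) for v in row[1:]])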