I am new to AWS Glue and I am trying to run a transformation process using PySpark. My ETL job ran successfully, but I am looking for another way of converting a DataFrame to a DynamicFrame.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# Wrap the active Spark context (creating one if needed) in a Glue context.
glueContext = GlueContext(SparkContext.getOrCreate())

# Read the crawled table from the Glue Data Catalog as a DynamicFrame.
students = glueContext.create_dynamic_frame.from_catalog(
    database="example_db",
    table_name="samp_csv",
)

# Bind a second name to the same DynamicFrame for the transformation steps.
students_trans = students

# DynamicFrame -> Spark DataFrame, so the DataFrame API can be used.
students_ = students_trans.toDF()

# Rename two columns and drop the ones that are not needed.
students_1 = (
    students_
    .withColumnRenamed("state", "County")
    .withColumnRenamed("capital", "cap")
    .drop("municipal", 'metropolitan')
)
# Uncomment to inspect the resulting schema:
# students_1.printSchema()

# Spark DataFrame -> DynamicFrame, so the Glue writer can be used.
from awsglue.dynamicframe import DynamicFrame
students_trans = students_trans.fromDF(students_1, glueContext, "students_trans")

# Write the result to the S3 target location as CSV.
glueContext.write_dynamic_frame.from_options(
    frame=students_trans,
    connection_type="s3",
    connection_options={"path": "s3://kingb/target/"},
    format="csv",
)
from awsglue.dynamicframe import DynamicFrame

# fromDF is called on the DynamicFrame class itself, so no existing
# DynamicFrame instance is needed for the conversion. Pass the script's
# GlueContext (`glueContext`); there is no `self` at script level, so
# `self._glue_context` would raise a NameError here.
students_trans = DynamicFrame.fromDF(students_1, glueContext, "df")
Related
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
from pandas import DataFrame
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session configured with the spark-bigquery connector package.
spark = SparkSession.builder \
.appName('Optimize BigQuery Storage') \
.config('spark.jars.packages', 'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.24.2') \
.getOrCreate()
# Read the GCS text file into a Spark DataFrame, using "," as the line separator.
df = spark.read.text('gs://test111/test/test.txt', lineSep=",")
#df.selectExpr("split(value, ',') as\
#Text_Data_In_Rows_Using_Text").show(4,False)
df.show()
# Target identifiers. Only table_id is passed to the load call below.
sProjectID = 'prj-d-xxxx-ingest'
sTargetDataset = 'data'
sTargetTable = 'xxx'
client = bigquery.Client()
table_id = 'data.xxx'
# Since string columns use the "object" dtype, pass in a (partial) schema
# to ensure the correct BigQuery data type.
job_config = bigquery.LoadJobConfig(schema=[
bigquery.SchemaField("Source_Code", "STRING"),
bigquery.SchemaField("FLAG", "STRING")
])
# NOTE(review): `df` is a Spark DataFrame (from spark.read.text above), while
# load_table_from_dataframe presumably expects a pandas DataFrame -- this looks
# like the source of the "'DataFrame' object has no attribute 'index'" error; confirm.
job = client.load_table_from_dataframe(
df, table_id, job_config=job_config
)
ERROR as below
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 1643, in getattr
AttributeError: 'DataFrame' object has no attribute 'index'
Instead of using google.cloud.bigquery, you can use the Spark BigQuery connector to write data to BQ. See the Spark BigQuery connector documentation.
Assuming that the data in your df variable is already correct, you can apply this code to write it to BQ:
# If not provided, it will create a temporary bucket for this
gcs_bucket = "your-gcs-bucket"

# Append the DataFrame's rows to the target table through the BigQuery connector.
bq_writer = (
    df.write
    .format("bigquery")
    .option("table", "dataset.your_table_here")
    .option("temporaryGcsBucket", gcs_bucket)
    .mode("append")  # .mode() can be either 'append' or 'overwrite'
)
bq_writer.save()
I executed the code using spark-submit with Spark BigQuery jar:
spark-submit --jars gs://spark-lib/bigquery/spark-bigquery-latest.jar my_script.py
See logs after running the script:
I am working on an AWS Glue job, and I am using Scala to write the code. I need to get the workflow run properties. I can do this very easily in Python; however, I could not find any sample code or documentation showing how to do this in Scala.
Equivalent code in python is as follows.
I will be very grateful if someone can help me with the scala equivalent.
import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from pyspark.context import SparkContext
# Client for the Glue API, used to look up the workflow run properties.
glue_client = boto3.client("glue")

# Resolve the job arguments this script needs.
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'WORKFLOW_NAME', 'WORKFLOW_RUN_ID'],
)
workflow_name = args['WORKFLOW_NAME']
workflow_run_id = args['WORKFLOW_RUN_ID']

# Fetch the run properties of this workflow run.
run_properties_response = glue_client.get_workflow_run_properties(
    Name=workflow_name,
    RunId=workflow_run_id,
)
workflow_params = run_properties_response["RunProperties"]

# Pull out the individual properties the job uses.
target_database = workflow_params['target_database']
target_s3_location = workflow_params['target_s3_location']
This worked for me.
import com.amazonaws.regions.Regions
import com.amazonaws.services.glue.{AWSGlue, AWSGlueClient}
import com.amazonaws.services.glue.model.GetWorkflowRunPropertiesRequest
import com.amazonaws.services.glue.model.GetWorkflowRunPropertiesResult
import com.amazonaws.services.glue.GlueContext
import com.amazonaws.services.glue.util.GlueArgParser
import com.amazonaws.services.glue.util.Job
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import com.amazonaws.services.glue.GlueContext
/** Glue Scala job that reads the run properties of the workflow run it was started from. */
object ReadProps {

  /**
   * Resolves the job arguments, creates the Spark/Glue contexts, fetches the
   * workflow run properties and prints the value stored under "propertykey".
   *
   * @param sysArgs raw job arguments; JOB_NAME, WORKFLOW_NAME and
   *                WORKFLOW_RUN_ID are resolved from them
   */
  def main(sysArgs: Array[String]): Unit = {
    val requiredArgs = Array("JOB_NAME", "WORKFLOW_NAME", "WORKFLOW_RUN_ID")
    val args = GlueArgParser.getResolvedOptions(sysArgs, requiredArgs)
    val workflowName = args("WORKFLOW_NAME")
    val workflowId = args("WORKFLOW_RUN_ID")

    val sc: SparkContext = new SparkContext()
    val glueContext: GlueContext = new GlueContext(sc)
    val sparkSession: SparkSession = glueContext.getSparkSession

    // Replace "your-region-name" with the region the workflow runs in.
    val glue = AWSGlueClient
      .builder()
      .withRegion(Regions.fromName("your-region-name"))
      .build()

    // Identify the workflow run by its name and run id.
    val req = new GetWorkflowRunPropertiesRequest()
      .withName(workflowName)
      .withRunId(workflowId)

    val runProperties = glue.getWorkflowRunProperties(req).getRunProperties()
    println(runProperties.get("propertykey"))
  }
}
Hi, I am working on an AWS Glue Spark job.
I have to pull the records from an RDS Aurora Postgres table, create a dynamic frame from them, and push them record by record to SQS.
Can someone help here?
#######################################
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
import json

# SQS queue that receives one message per table row.
sqs = boto3.resource('sqs')
sqs_queue_url = "https://sqs.us-east-1.amazonaws.com/ACCID/SampleQue"
queue = sqs.Queue(sqs_queue_url)

# Standard Glue job bootstrap.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
job = Job(glueContext)
## @params: [JOB_NAME]
job.init(args['JOB_NAME'], args)
logger = glueContext.get_logger()

# Load the source table as a DynamicFrame.
# The connection type is spelled "postgresql" (the original "postgreasql" was a typo).
# NOTE(review): confirm that from_options honours `catalog_connection`; JDBC
# connection details may need to be supplied through connection_options instead.
df = glueContext.create_dynamic_frame.from_options(
    catalog_connection="ams-connection",
    connection_type="postgresql",
    connection_options={
        "dbtable": "input",
        "database": "ams",
    },
)

# A DynamicFrame cannot be iterated directly, so convert it to a Spark
# DataFrame and iterate over its rows on the driver.
for row in df.toDF().toLocalIterator():
    # MessageBody must be a string, so serialise each row as JSON.
    # default=str is deliberate: it renders values json cannot encode natively
    # (e.g. dates, decimals) as their string form.
    queue.send_message(
        MessageBody=json.dumps(row.asDict(recursive=True), default=str)
    )

# Commit only after every record has been sent.
job.commit()
I would like to know the PySpark equivalent of the following code in Scala. I am using databricks. I need the same output as below:-
to create new Spark session and output the session id (SparkSession#123d0e8)
val new_spark = spark.newSession()
**Output**
new_spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession#123d0e8
to view SparkContext and output the SparkContext id (SparkContext#2dsdas33)
new_spark.sparkContext
**Output**
org.apache.spark.SparkContext = org.apache.spark.SparkContext#2dsdas33
It's very similar. If you have already a session and want to open another one, you can use
# Open an additional session from the already-running `spark` session object.
my_session = spark.newSession()
# Printing the session shows its object representation.
print(my_session)
This will produce the new session object I think you are trying to create
<pyspark.sql.session.SparkSession object at 0x7fc3bae3f550>
spark is a session object already running, because you are using a databricks notebook
SparkSession could be created as http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html
>>> from pyspark.sql import SparkSession
>>> from pyspark.conf import SparkConf
>>> SparkSession.builder.config(conf=SparkConf())
or
>>> from pyspark.sql import SparkSession
>>> spark = SparkSession.builder.appName('FirstSparkApp').getOrCreate()
I have searched Stack Overflow to find out how to append a new column with the source file names as values.
But it didn't work out as expected.
In my final Parquet file, I found a new column named input_file_name, but its value is empty (like "").
I am wondering which step I missed.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import functions as F
from awsglue.dynamicframe import DynamicFrame
# Standard Glue job bootstrap.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read the catalog table as a DynamicFrame.
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="mydb",
    table_name="mytable",
    transformation_ctx="datasource0",
)

# Convert to a Spark DataFrame and add a column holding the source file name.
datasource1 = datasource0.toDF().withColumn(
    "input_file_name",
    F.input_file_name(),
)

# Convert back to a DynamicFrame for the Glue transforms and writer.
datasource2 = DynamicFrame.fromDF(datasource1, glueContext, "datasource2")

# Keep only the file-name column and the Profile struct.
column_mappings = [
    ("input_file_name", "string", "input_file_name", "string"),
    ("Profile", "struct", "Profile", "struct"),
]
applymapping1 = ApplyMapping.apply(
    frame=datasource2,
    mappings=column_mappings,
    transformation_ctx="applymapping1",
)

# Write the mapped frame to S3 as Parquet.
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=applymapping1,
    connection_type="s3",
    connection_options={"path": "s3://temp/testing"},
    format="parquet",
    transformation_ctx="datasink4",
)
job.commit()
You can't use input_file_name() on a DataFrame obtained by converting a DynamicFrame.
You must use the spark.read API to read the data into a DataFrame. The immediate next step must be to call input_file_name() - before performing any other operation on the DataFrame.
Remember that AWS Glue has a known issue: it works with up to 45 GB of data and then throws an error. Increasing the number of DPUs won't help.