I am trying to reuse code I copied from https://www.opsguru.io/post/solution-walkthrough-visualizing-daily-cloud-spend-on-gcp-using-gke-dataflow-bigquery-and-grafana. I am not too familiar with Python, so I am asking for help here. I am trying to copy GCP BigQuery data into Postgres.
I have made some modifications to the code and am now getting an error, due either to my mistake or to the code itself.
Here is what I have:
import uuid
import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions, GoogleCloudOptions, WorkerOptions
from beam_nuggets.io import relational_db
from apache_beam.io.gcp import bigquery
parser = argparse.ArgumentParser()
args = parser.parse_args()
project = args.project("project", help="Enter Project ID")
job_name = args.job_name + str(uuid.uuid4())
bigquery_source = args.bigquery_source
postgresql_user = args.postgresql_user
postgresql_password = args.postgresql_password
postgresql_host = args.postgresql_host
postgresql_port = args.postgresql_port
postgresql_db = args.postgresql_db
postgresql_table = args.postgresql_table
staging_location = args.staging_location
temp_location = args.temp_location
subnetwork = args.subnetwork
options = PipelineOptions(
flags=["--requirements_file", "/opt/python/requirements.txt"])
# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = project
google_cloud_options.job_name = job_name
google_cloud_options.staging_location = staging_location
google_cloud_options.temp_location = temp_location
google_cloud_options.region = "europe-west4"
worker_options = options.view_as(WorkerOptions)
worker_options.zone = "europe-west4-a"
worker_options.subnetwork = subnetwork
worker_options.max_num_workers = 20
options.view_as(StandardOptions).runner = 'DataflowRunner'
start_date = define_start_date()
with beam.Pipeline(options=options) as p:
    rows = p | 'QueryTableStdSQL' >> beam.io.Read(beam.io.BigQuerySource(
        query='SELECT \
            billing_account_id, \
            service.id as service_id, \
            service.description as service_description, \
            sku.id as sku_id, \
            sku.description as sku_description, \
            usage_start_time, \
            usage_end_time, \
            project.id as project_id, \
            project.name as project_description, \
            TO_JSON_STRING(project.labels) as project_labels, \
            project.ancestry_numbers as project_ancestry_numbers, \
            TO_JSON_STRING(labels) as labels, \
            TO_JSON_STRING(system_labels) as system_labels, \
            location.location as location_location, \
            location.country as location_country, \
            location.region as location_region, \
            location.zone as location_zone, \
            export_time, \
            cost, \
            currency, \
            currency_conversion_rate, \
            usage.amount as usage_amount, \
            usage.unit as usage_unit, \
            usage.amount_in_pricing_units as usage_amount_in_pricing_units, \
            usage.pricing_unit as usage_pricing_unit, \
            TO_JSON_STRING(credits) as credits, \
            invoice.month as invoice_month cost_type \
            FROM `' + project + '.' + bigquery_source + '` \
            WHERE export_time >= "' + start_date + '"', use_standard_sql=True))
    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host=postgresql_host,
        port=postgresql_port,
        username=postgresql_user,
        password=postgresql_password,
        database=postgresql_db,
        create_if_missing=True,
    )
    table_config = relational_db.TableConfiguration(
        name=postgresql_table,
        create_if_missing=True
    )
    rows | 'Writing to DB' >> relational_db.Write(
        source_config=source_config,
        table_config=table_config
    )
When I run the program, I get the following error:
bq-to-sql.py: error: unrecognized arguments: --project xxxxx --job_name bq-to-sql-job --bigquery_source xxxxxxxx
--postgresql_user xxxxx --postgresql_password xxxxx --postgresql_host xx.xx.xx.xx --postgresql_port 5432 --postgresql_db xxxx --postgresql_table xxxx --staging_location g
s://xxxxx-staging --temp_location gs://xxxxx-temp --subnetwork regions/europe-west4/subnetworks/xxxx
argparse needs to be configured. It works like magic, but it does need configuration. These lines are needed between parser = argparse.ArgumentParser() and args = parser.parse_args():
parser.add_argument("--project")
parser.add_argument("--job_name")
parser.add_argument("--bigquery_source")
parser.add_argument("--postgresql_user")
parser.add_argument("--postgresql_password")
parser.add_argument("--postgresql_host")
parser.add_argument("--postgresql_port")
parser.add_argument("--postgresql_db")
parser.add_argument("--postgresql_table")
parser.add_argument("--staging_location")
parser.add_argument("--temp_location")
parser.add_argument("--subnetwork")
argparse is a useful library. I recommend adding options such as help=, type= and required= to these add_argument calls.
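For example, a fuller parser might look like the sketch below; the help texts, required flags, and the int type for the port are illustrative assumptions, not taken from the original script:

parser = argparse.ArgumentParser(description="Copy a BigQuery billing export into PostgreSQL")
parser.add_argument("--project", required=True, help="GCP project ID")
parser.add_argument("--job_name", required=True, help="Prefix for the Dataflow job name")
parser.add_argument("--bigquery_source", required=True, help="dataset.table of the billing export")
parser.add_argument("--postgresql_user", required=True)
parser.add_argument("--postgresql_password", required=True)
parser.add_argument("--postgresql_host", required=True)
parser.add_argument("--postgresql_port", type=int, default=5432)
parser.add_argument("--postgresql_db", required=True)
parser.add_argument("--postgresql_table", required=True)
parser.add_argument("--staging_location", required=True, help="gs:// path for staging")
parser.add_argument("--temp_location", required=True, help="gs:// path for temp files")
parser.add_argument("--subnetwork", required=True)
args = parser.parse_args()

# The parsed values are plain attributes, so read them directly:
project = args.project
job_name = args.job_name + str(uuid.uuid4())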
This is my first Kafka project (with Spark streaming)
I am trying to read a Kafka topic which is getting data from an upstream source.
They are pushing data into the Kafka topic in the below way:
def kafka_ingest(df: DataFrame, kafkaconfig: dict, topic_name: str):
    jaas_config = kafkaconfig['jaas_config'] + \
        f" oauth.client.id='{kafkaconfig['client_id']}'" + \
        f" oauth.client.secret='{kafkaconfig['client_secret']}'" + \
        f" oauth.token.endpoint.uri='{kafkaconfig['endpoint_uri']}'" + \
        " oauth.max.token.expiry.seconds='30000' ;"
    df.write.format('kafka') \
        .option('kafka.bootstrap.servers', kafkaconfig['kafka_broker']) \
        .option('kafka.batch.size', kafkaconfig['kafka_batch_size']) \
        .option('retries', kafkaconfig['retries']) \
        .option('kafka.max.block.ms', kafkaconfig['kafka_max_block_ms']) \
        .option('kafka.metadata.max.age.ms', kafkaconfig['kafka_metadata_max_age_ms']) \
        .option('kafka.request.timeout.ms', kafkaconfig['kafka_request_timeout_ms']) \
        .option('kafka.linger.ms', kafkaconfig['kafka_linger_ms']) \
        .option('kafka.delivery.timeout.ms', kafkaconfig['kafka_delivery_timeout_ms']) \
        .option('acks', kafkaconfig['acks']) \
        .option('kafka.security.protocol', kafkaconfig['kafka_security_protocol']) \
        .option('kafka.sasl.jaas.config', jaas_config) \
        .option('kafka.sasl.login.callback.handler.class', kafkaconfig['kafka_sasl_login_callback_handler_class']) \
        .option('kafka.sasl.mechanism', kafkaconfig['kafka_sasl_mechanism']) \
        .option('topic', topic_name) \
        .save()
When ingesting data into Kafka, I call the above method inside a foreachBatch, where I also specify the corresponding checkpoint, as shown below.
def write_stream_batches(kafka_df: DataFrame, checkpoint_location: str):
    kafka_df.writeStream \
        .format('kafka') \
        .foreachBatch(join_kafka_streams_po_denorm) \
        .option('checkpointLocation', checkpoint_location) \
        .start() \
        .awaitTermination()

def join_kafka_streams_po_denorm(kafka_df: DataFrame, batch_id: int):
    final_df = kafka_df.some_transformations
    kafka_ingest(final_df, kafkaconfig, topic_name)
I am reading data from the topic as:
def extract_kafka_data(kafka_config: dict, topic_name: str, column_schema: str, checkpoint_location: str):
    schema = extract_schema(column_schema)
    jaas_config = kafka_config['jaas_config'] \
        + " oauth.token.endpoint.uri=" + '"' + kafka_config['endpoint_uri'] + '"' \
        + " oauth.client.id=" + '"' + kafka_config['client_id'] + '"' \
        + " oauth.client.secret=" + '"' + kafka_config['client_secret'] + '" ;'
    stream_df = spark.readStream \
        .format('kafka') \
        .option('kafka.bootstrap.servers', kafka_config['kafka_broker']) \
        .option('subscribe', topic_name) \
        .option('kafka.security.protocol', kafka_config['kafka_security_protocol']) \
        .option('kafka.sasl.mechanism', kafka_config['kafka_sasl_mechanism']) \
        .option('kafka.sasl.jaas.config', jaas_config) \
        .option('kafka.sasl.login.callback.handler.class', kafka_config['kafka_sasl_login_callback_handler_class']) \
        .option('startingOffsets', 'earliest') \
        .option('fetchOffset.retryIntervalMs', kafka_config['kafka_fetch_offset_retry_intervalms']) \
        .option('fetchOffset.numRetries', kafka_config['retries']) \
        .option('failOnDataLoss', 'False') \
        .option('checkpointLocation', checkpoint_location) \
        .load() \
        .select(from_json(col('value').cast('string'), schema).alias("json_dta")).selectExpr('json_dta.*')
    return stream_df
Every time I display data from the dataframe, I see the same data returned:
Read 1:
df = extract_kafka_data(kafka_config, topic_name, column_schema, checkpoint_location)
display(df)
output:
+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance |10 |
|Marketing|20 |
|Sales |30 |
|IT |40 |
+---------+-------+
I have 4 records in my topic, which I ingested using the kafka_ingest method. Now that I have read all 4 records, I expect no output if I read the topic again.
Read 2:
df = extract_kafka_data(kafka_config, topic_name, column_schema, checkpoint_location)
display(df)
output:
+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance |10 |
|Marketing|20 |
|Sales |30 |
|IT |40 |
+---------+-------+
Once I have read the data in Read 1, I shouldn't see the same data again, as per the offset mechanism of the topic.
But the job returns the same data as in Read 1.
Is there something wrong with the way I have set up the offset strategy and the checkpointing?
Could anyone let me know what mistake I am making here?
Any help is massively appreciated.
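For reference, here is a minimal sketch (names are illustrative and the security options are omitted) of how offset progress is normally recorded in Structured Streaming: it is committed under the checkpointLocation of a started write query, not via options on readStream, so a read that is only displayed does not record any progress:

# Sketch only: offsets advance when a streaming *write* with a checkpoint
# completes; a readStream that is merely displayed starts again from
# 'startingOffsets' on the next run.
stream_df = spark.readStream \
    .format('kafka') \
    .option('kafka.bootstrap.servers', kafka_config['kafka_broker']) \
    .option('subscribe', topic_name) \
    .option('startingOffsets', 'earliest') \
    .load()

def process_batch(batch_df, batch_id):
    batch_df.count()  # placeholder for real per-batch processing

query = stream_df.writeStream \
    .foreachBatch(process_batch) \
    .option('checkpointLocation', checkpoint_location) \
    .trigger(once=True) \
    .start()
query.awaitTermination()
# Re-running with the same checkpoint_location resumes after the last
# committed offsets instead of re-reading the four records.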
I have created a delta table and now I'm trying to merge data into that table using foreachBatch(). I've followed this example. I am running this code on Dataproc image 1.5x in Google Cloud.
Spark version 2.4.7
Delta version 0.6.0
My code looks as follows:
from pyspark.sql import SparkSession
from delta.tables import *

spark = SparkSession.builder \
    .appName("streaming_merge") \
    .master("local[*]") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

# Function to upsert `microBatchOutputDF` into the Delta table using MERGE
def mergeToDelta(microBatchOutputDF, batchId):
    (deltaTable.alias("accnt").merge(
        microBatchOutputDF.alias("updates"),
        "accnt.acct_nbr = updates.acct_nbr")
        .whenMatchedDelete(condition="updates.cdc_ind='D'")
        .whenMatchedUpdateAll(condition="updates.cdc_ind='U'")
        .whenNotMatchedInsertAll(condition="updates.cdc_ind!='D'")
        .execute()
    )

deltaTable = DeltaTable.forPath(spark, "gs://<<path_for_the_target_delta_table>>")

# Define the source extract
SourceDF = (
    spark.readStream
    .format("delta")
    .load("gs://<<path_for_the_source_delta_location>>")
)

# Start the query to continuously upsert into target tables in update mode
SourceDF.writeStream \
    .format("delta") \
    .outputMode("update") \
    .foreachBatch(mergeToDelta) \
    .option("checkpointLocation", "gs://<<path_for_the_checkpoint_location>>") \
    .trigger(once=True) \
    .start()
This code runs without any problems, but no data is written to the delta table. I suspect foreachBatch is not getting invoked. Does anyone know what I'm doing wrong?
After adding awaitTermination, the streaming query started working, picked up the latest data from the source, and performed the merge on the delta target table.
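For reference, a sketch of the corrected sink call with awaitTermination() chained on (same placeholders as above):

SourceDF.writeStream \
    .format("delta") \
    .outputMode("update") \
    .foreachBatch(mergeToDelta) \
    .option("checkpointLocation", "gs://<<path_for_the_checkpoint_location>>") \
    .trigger(once=True) \
    .start() \
    .awaitTermination()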
I have an issue where I am unsure how to get a proper start date for my query. I get the following error and am unsure how to go about fixing it. Can I get help with the time conversion format, please?
apache_beam.runners.dataflow.dataflow_runner.DataflowRuntimeException: Dataflow pipeline failed. State: FAILED, Error:
Workflow failed. Causes: S01:QueryTableStdSQL+Writing to DB/ParDo(_WriteToRelationalDBFn) failed., BigQuery execution failed., Error:
Message: No matching signature for operator >= for argument types: TIMESTAMP, INT64. Supported signature: ANY >= ANY at [1:1241]
HTTP Code: 400
My script's main query looks like this:
with beam.Pipeline(options=options) as p:
    rows = p | 'QueryTableStdSQL' >> beam.io.Read(beam.io.BigQuerySource(use_standard_sql=True,
        query='SELECT \
            billing_account_id, \
            service.id as service_id, \
            service.description as service_description, \
            sku.id as sku_id, \
            sku.description as sku_description, \
            usage_start_time, \
            usage_end_time, \
            project.id as project_id, \
            project.name as project_description, \
            TO_JSON_STRING(project.labels) as project_labels, \
            project.ancestry_numbers as project_ancestry_numbers, \
            TO_JSON_STRING(labels) as labels, \
            TO_JSON_STRING(system_labels) as system_labels, \
            location.location as location_location, \
            location.country as location_country, \
            location.region as location_region, \
            location.zone as location_zone, \
            export_time, \
            cost, \
            currency, \
            currency_conversion_rate, \
            usage.amount as usage_amount, \
            usage.unit as usage_unit, \
            usage.amount_in_pricing_units as usage_amount_in_pricing_units, \
            usage.pricing_unit as usage_pricing_unit, \
            TO_JSON_STRING(credits) as credits, \
            invoice.month as invoice_month, \
            cost_type, \
            FROM `pprodjectID.bill_usage.gcp_billing_export_v1_xxxxxxxx` \
            WHERE export_time >= 2020-01-01'))
    source_config = relational_db.SourceConfiguration(
The date format on the BigQuery console:
export_time: 2018-01-25 01:18:55.637 UTC
usage_start_time: 2018-01-24 21:23:10.643 UTC
You forgot to pass the time as a string:
WHERE export_time >= 2020-01-01
The above is evaluated as integer arithmetic (2020 - 01 - 01 = 2018), so BigQuery ends up comparing a TIMESTAMP with an INT64. You should have:
WHERE export_time >= "2020-01-01"
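For example, the start date could be built as a formatted string and interpolated into the query. This is only a sketch; the 30-day lookback is an assumption, and the explicit TIMESTAMP() cast is optional but makes the comparison unambiguous:

from datetime import date, timedelta

# Illustrative only: produce a 'YYYY-MM-DD' string rather than a bare number
start_date = (date.today() - timedelta(days=30)).strftime('%Y-%m-%d')

where_clause = 'WHERE export_time >= TIMESTAMP("' + start_date + '")'
# produces something like: WHERE export_time >= TIMESTAMP("2020-01-01")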
I have established the connection between PySpark and Redshift using the following code.
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
import psycopg2
DATABASE = "d"
USER = "user1"
PASSWORD = "1234"
HOST = "sparkvalidation.crv9zfdiseqm.us-west-2.redshift.amazonaws.com"
PORT = "5439"
SCHEMA = "public"
connection_string = "redshift+psycopg2://%s:%s@%s:%s/%s" % (USER, PASSWORD, HOST, str(PORT), DATABASE)
engine = sa.create_engine(connection_string)
session = sessionmaker()
session.configure(bind=engine)
s = session()
SetPath = "SET search_path TO %s" % SCHEMA
s.execute(SetPath)
Now how can I write a pyspark dataframe to Redshift?
If you use Databricks, you could write something like this:
dataframe.write \
.format("com.databricks.spark.redshift") \
.option("url", connection_string) \
.option("dbtable", "target") \
.option("tempdir", "s3a://your_s3_tmp_bucket/tmp_data") \
.mode("error") \
.save()
Note that you need an S3 bucket for the tempdir, as is usually the case when copying data into Redshift. Also note that the url option expects a JDBC-style URL (jdbc:redshift://host:port/database?user=...&password=...), not the SQLAlchemy connection string built above.
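If you are not on Databricks, a plain Spark JDBC write is another option. This is only a sketch built from the variables above; the Redshift JDBC driver class name is an assumption about your setup, and the driver jar must be on the Spark classpath. It also writes rows over JDBC rather than COPYing from S3, so it is slower for large dataframes:

# Hypothetical JDBC URL assembled from the question's variables
jdbc_url = "jdbc:redshift://%s:%s/%s" % (HOST, PORT, DATABASE)

dataframe.write \
    .format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", "%s.target" % SCHEMA) \
    .option("user", USER) \
    .option("password", PASSWORD) \
    .option("driver", "com.amazon.redshift.jdbc42.Driver") \
    .mode("error") \
    .save()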
I have imported a table into HDFS with fields-terminated-by '|':
sqoop import \
--connect jdbc:mysql://connection \
--username \
--password \
--table products \
--as-textfile \
--target-dir /user/username/productsdemo \
--fields-terminated-by '|'
After that, I am trying to read it as an RDD using spark-shell version 1.6.2
var productsRDD = sc.textFile("/user/username/productsdemo")
and converting it into a data frame
var productsDF = productsRDD.map(product => {
    var o = product.split("|");
    products(o(0).toInt, o(1).toInt, o(2), o(3), o(4).toFloat, o(5))
}).toDF("product_id", "product_category_id", "product_name", "product_description", "product_price", "product_image")
But when I try to print the output, it throws the exception below.
java.lang.NumberFormatException: For input string: "|"
Why am I getting this error? Can anyone help me out with this?
split uses a regex to split the string. Since | is a special character in regex (it means OR), you need to use \\| instead of | when splitting:
var o = product.split("\\|");