Timestamp conversion from AWS Glue while transferring data to Redshift - pyspark

I have a file in S3 that we are importing into Redshift using Glue.
The crawler part is done.
In one column the data is of datetime type, but it is not in a standard format, so the crawler could not identify it and marked it as a string.
I have now created the table in Redshift with that column's datatype set to timestamp. While creating the job, where and what do I need to change in the script so that the string is converted to a Redshift timestamp?
The format of the date in the S3 file is 'yyyy.mm.dd HH:mi:ss', and the script is below.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## #params: [TempDir, JOB_NAME]
args = getResolvedOptions(sys.argv, ['TempDir','JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## #type: DataSource
## #args: [database = "", table_name = "", transformation_ctx = "datasource0"]
## #return: datasource0
## #inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "", table_name = "", transformation_ctx = "datasource0")
## #type: ApplyMapping
## #args: [mapping = [("mrp", "long", "mrp", "decimal(10,2)"), ("mop", "double", "mop", "decimal(10,2)"), ("mop_update_timestamp", "string", "mop_update_timestamp", "timestamp"), ("special_price", "long", "special_price", "decimal(10,2)"), ("promotion_identifier", "string", "promotion_identifier", "string"), ("is_percentage_promotion", "string", "is_percentage_promotion", "string"), ("promotion_value", "string", "promotion_value", "decimal(10,2)"), ("max_discount", "long", "max_discount", "decimal(10,2)"), ("promotion_start_date", "string", "promotion_start_date", "timestamp"), ("promotion_end_date", "string", "promotion_end_date", "timestamp")], transformation_ctx = "applymapping1"]
## #return: applymapping1
## #inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [ ("mrp", "long", "mrp", "decimal(10,2)"), ("mop", "double", "mop", "decimal(10,2)"), ("mop_update_timestamp", "string", "mop_update_timestamp", "timestamp"), ("special_price", "long", "special_price", "decimal(10,2)"), ("promotion_identifier", "string", "promotion_identifier", "string"), ("is_percentage_promotion", "string", "is_percentage_promotion", "string"), ("promotion_value", "string", "promotion_value", "decimal(10,2)"), ("max_discount", "long", "max_discount", "decimal(10,2)"), ("promotion_start_date", "string", "promotion_start_date", "timestamp"), ("promotion_end_date", "string", "promotion_end_date", "timestamp")], transformation_ctx = "applymapping1")
## #type: ResolveChoice
## #args: [choice = "make_cols", transformation_ctx = "resolvechoice2"]
## #return: resolvechoice2
## #inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(frame = applymapping1, choice = "make_cols", transformation_ctx = "resolvechoice2")
## #type: DropNullFields
## #args: [transformation_ctx = "dropnullfields3"]
## #return: dropnullfields3
## #inputs: [frame = resolvechoice2]
dropnullfields3 = DropNullFields.apply(frame = resolvechoice2, transformation_ctx = "dropnullfields3")
## #type: DataSink
## #args: [catalog_connection = "", connection_options = {"dbtable": "", "database": ""}, redshift_tmp_dir = TempDir, transformation_ctx = "datasink4"]
## #return: datasink4
## #inputs: [frame = dropnullfields3]
datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame = dropnullfields3, catalog_connection = "", connection_options = {"dbtable": "", "database": ""}, redshift_tmp_dir = args["TempDir"], transformation_ctx = "datasink4")
job.commit()

Have you tried converting it to a DataFrame and then casting to timestamp, since you have it in 'yyyy.mm.dd HH:mi:ss' format? Something like this:
## Add this in order to use DynamicFrame.fromDF
from awsglue.dynamicframe import DynamicFrame
## Make a dataframe
df_datasource0 = datasource0.toDF()
## add a column mop_update_timestamp_ts where you cast mop_update_timestamp to a timestamp
df_datasource0 = df_datasource0.withColumn('mop_update_timestamp_ts',df_datasource0.mop_update_timestamp.cast('timestamp'))
## Transform the dataframe back to a dynamic frame again
datasource0 = DynamicFrame.fromDF(df_datasource0, glueContext, "datasource0")
## Use the mop_update_timestamp_ts column instead as below.
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [ ("mrp", "long", "mrp", "decimal(10,2)"), ("mop", "double", "mop", "decimal(10,2)"), ("mop_update_timestamp_ts", "timestamp", "mop_update_timestamp", "timestamp"), ("special_price", "long", "special_price", "decimal(10,2)"), ("promotion_identifier", "string", "promotion_identifier", "string"), ("is_percentage_promotion", "string", "is_percentage_promotion", "string"), ("promotion_value", "string", "promotion_value", "decimal(10,2)"), ("max_discount", "long", "max_discount", "decimal(10,2)"), ("promotion_start_date", "string", "promotion_start_date", "timestamp"), ("promotion_end_date", "string", "promotion_end_date", "timestamp")], transformation_ctx = "applymapping1")
Let me know if it works for you.
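If the plain cast does not handle the dotted 'yyyy.mm.dd HH:mi:ss' values, a variant of the same idea with an explicit format pattern may work better. This is only a minimal sketch, reusing datasource0 and glueContext from the script above; 'yyyy.MM.dd HH:mm:ss' is the Spark pattern equivalent of the question's format.
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql import functions as F

df = datasource0.toDF()
# Parse the dotted source format explicitly instead of relying on a bare cast.
df = df.withColumn("mop_update_timestamp_ts",
                   F.to_timestamp(F.col("mop_update_timestamp"), "yyyy.MM.dd HH:mm:ss"))
datasource0 = DynamicFrame.fromDF(df, glueContext, "datasource0")
The rest of the job stays the same; the ApplyMapping call then maps mop_update_timestamp_ts to the Redshift timestamp column exactly as in the mapping shown above.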

I had a similar problem converting a string to a timestamp with PySpark.
The way I did it, so that it appears in Athena with type timestamp, was to import the Spark functions under an alias to avoid name clashes, create a new column with a timestamp datatype, convert the values, and finally write that column to S3, where it is picked up by Athena.
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql import functions as spark_f  # avoid name clash

dyf = glue_context.create_dynamic_frame.from_options...
df = dyf.toDF()
# Parse the string column into a proper timestamp column
modified_df = df.withColumn("value_timestamp",
                            spark_f.to_timestamp(spark_f.col("value_string"), 'yyyy/MM/dd HH:mm'))
modified_dyf = DynamicFrame.fromDF(modified_df, glue_context, "modified_dyf")
Transform = ApplyMapping.apply(frame=modified_dyf,
                               mappings=[
                                   ('value_timestamp', 'timestamp', 'column-name', 'timestamp'),
                                   ...
                               ])

Related

How to upload files from local to staging table in Snowflake using Spark

I was using the Snowflake connector in Python for the same purpose.
The code I used was:
import snowflake.connector as sf

conn = sf.connect(user=user, password=password, account=account, warehouse=warehouse,
                  database=database, schema=schema)

def execute_query(connection, query):
    cursor = connection.cursor()
    cursor.execute(query)
    cursor.close()

query = "create or replace stage table_stage file_format = (TYPE=CSV);"
execute_query(conn, query)

query = "put file://local_file.csv @table_stage auto_compress=true"
execute_query(conn, query)
Now I need to achieve the same using Spark; the code I'm using is:
sfOptions = {
    "sfURL": "url",
    "sfAccount": "account",
    "sfUser": "user",
    "sfPassword": "user",
    "sfDatabase": "database",
    "sfSchema": "PUBLIC",
    "sfWarehouse": "warehouse"
}

spark.sparkContext._jvm.net.snowflake.spark.snowflake.Utils.runQuery(sfOptions,
    "create or replace stage table_stage file_format = (TYPE=CSV);")

spark.sparkContext._jvm.net.snowflake.spark.snowflake.Utils.runQuery(sfOptions,
    "put file://local_file.csv @table_stage auto_compress=true")
I'm able to create the staging table with this, but I am not able to upload the files.
Please suggest an alternative method for doing the same.
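One alternative, if the PUT through the JVM bridge keeps failing, is to skip the manual stage entirely and let the Snowflake Spark connector write the data for you. Below is a minimal sketch, assuming the spark-snowflake connector and the Snowflake JDBC driver are on the classpath; the target table name MY_TABLE is a hypothetical placeholder.
# Read the local CSV with Spark, then write it straight to Snowflake
# through the spark-snowflake connector instead of staging it with PUT.
df = spark.read.option("header", "true").csv("file://local_file.csv")

(df.write
   .format("net.snowflake.spark.snowflake")
   .options(**sfOptions)
   .option("dbtable", "MY_TABLE")  # hypothetical target table
   .mode("append")
   .save())
The connector stages and copies the data internally, so no explicit CREATE STAGE or PUT is needed.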

How to Write AWS Glue Script to Insert New Data into a Redshift Table

I'm brand new to AWS Glue and want to create a job that will take a SQL script I've written (an INSERT INTO statement) and populate an empty table I have in Redshift. Is this possible? If so, what is the syntax?
I've started with a test case: copying data from one table in my Redshift cluster to another.
This is the script proposed by AWS. I selected the "Change Schema" option because I wanted to create a new target dataset.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## #params: [TempDir, JOB_NAME]
args = getResolvedOptions(sys.argv, ['TempDir','JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## #type: DataSource
## #args: [database = "dev", table_name = "patients", redshift_tmp_dir = TempDir, transformation_ctx = "datasource0"]
## #return: datasource0
## #inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "dev", table_name = "patients", redshift_tmp_dir = args["TempDir"], transformation_ctx = "datasource0")
## #type: ApplyMapping
## #args: [mapping = [("birthdate", "string", "date of birth", "string"), ("_id", "string", "patient id", "string"), ("name_middle", "string", "patient middle name", "string"), ("gender", "string", "gender", "string"), ("name_family", "string", "patient last name", "string"), ("name_given", "string", "patient first name", "string")], transformation_ctx = "applymapping1"]
## #return: applymapping1
## #inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("birthdate", "string", "date of birth", "string"), ("_id", "string", "patient id", "string"), ("name_middle", "string", "patient middle name", "string"), ("gender", "string", "gender", "string"), ("name_family", "string", "patient last name", "string"), ("name_given", "string", "patient first name", "string")], transformation_ctx = "applymapping1")
## #type: SelectFields
## #args: [paths = ["gender", "patient middle name", "patient last name", "patient first name", "patient id", "date of birth"], transformation_ctx = "selectfields2"]
## #return: selectfields2
## #inputs: [frame = applymapping1]
selectfields2 = SelectFields.apply(frame = applymapping1, paths = ["gender", "patient middle name", "patient last name", "patient first name", "patient id", "date of birth"], transformation_ctx = "selectfields2")
## #type: ResolveChoice
## #args: [choice = "MATCH_CATALOG", database = "dev", table_name = "patients_info", transformation_ctx = "resolvechoice3"]
## #return: resolvechoice3
## #inputs: [frame = selectfields2]
resolvechoice3 = ResolveChoice.apply(frame = selectfields2, choice = "MATCH_CATALOG", database = "dev", table_name = "patients_info", transformation_ctx = "resolvechoice3")
## #type: ResolveChoice
## #args: [choice = "make_cols", transformation_ctx = "resolvechoice4"]
## #return: resolvechoice4
## #inputs: [frame = resolvechoice3]
resolvechoice4 = ResolveChoice.apply(frame = resolvechoice3, choice = "make_cols", transformation_ctx = "resolvechoice4")
## #type: DataSink
## #args: [database = "dev", table_name = "patients_info", redshift_tmp_dir = TempDir, transformation_ctx = "datasink5"]
## #return: datasink5
## #inputs: [frame = resolvechoice4]
datasink5 = glueContext.write_dynamic_frame.from_catalog(frame = resolvechoice4, database = "dev", table_name = "patients_info", redshift_tmp_dir = args["TempDir"], transformation_ctx = "datasink5")
job.commit()
And then I tried a simple use case that still failed:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
persons = glueContext.create_dynamic_frame.from_catalog(
    database = "dev",
    table_name = "patients",
    redshift_tmp_dir = args["TempDir"],
    additional_options = {"aws_iam_role": "arn:aws:iam::account-id:role/role-name"})
print "Count: ", persons.count()
persons.printSchema()
You should not be considering INSERT INTO as a way to write data to Redshift; it is very slow.
The correct process is:
Write the data to S3.
Copy the data from S3 into Redshift using the Redshift COPY command (a sketch of both steps follows below).
You should read this AWS doc carefully:
https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-redshift.html
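A minimal sketch of that two-step flow, assuming the resolvechoice4 frame from the script above and a pre-created Redshift table; the bucket name and IAM role below are hypothetical placeholders.
# Step 1: land the data in S3 as CSV from the Glue job.
glueContext.write_dynamic_frame.from_options(
    frame = resolvechoice4,
    connection_type = "s3",
    connection_options = {"path": "s3://my-staging-bucket/patients_info/"},  # hypothetical bucket
    format = "csv")

# Step 2: load it into Redshift with COPY, run via a SQL client, psycopg2,
# or the Redshift Data API rather than from the Glue script itself.
copy_sql = """
COPY patients_info
FROM 's3://my-staging-bucket/patients_info/'
IAM_ROLE 'arn:aws:iam::123456789012:role/my-redshift-role'
FORMAT AS CSV;
"""
Note that when you write through a Glue Redshift connection (from_catalog or from_jdbc_conf), Glue already performs this stage-to-S3-then-COPY flow for you under the hood, using the TempDir you pass in.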

Field does not exist on transformations to extract key with Debezium

I am trying to create a Debezium MySQL connector with a transformation to extract the key.
Before key transformations:
create source connector mysql with(
"connector.class" = 'io.debezium.connector.mysql.MySqlConnector',
"database.hostname" = 'mysql',
"tasks.max" = '1',
"database.port" = '3306',
"database.user" = 'debezium',
"database.password" = 'dbz',
"database.server.id" = '42',
"database.server.name" = 'before',
"table.whitelist" = 'deepprices.deepprices',
"database.history.kafka.bootstrap.servers" = 'kafka:29092',
"database.history.kafka.topic" = 'dbz.deepprices',
"include.schema.changes" = 'true',
"transforms" = 'unwrap',
"transforms.unwrap.type" = 'io.debezium.transforms.UnwrapFromEnvelope');
The topic results are:
> rowtime: 2020/05/20 16:47:23.354 Z, key: [St#5778462697648631933/8247607644536792125], value: {"id": "P195910", "price": "1511.64"}
When key.converter is set to JSON, the key becomes {"id": "P195910"}.
So I want to extract id from the key and make it a string key.
Expected results:
rowtime: 2020/05/20 16:47:23.354 Z,
key: 'P195910',
value: {"id": "P195910", "price": "1511.64"}
While trying to use a transformation with ExtractField or ValueToKey I get:
DataException: Field does not exist: id:
My attempt, with the connector definition containing ValueToKey:
create source connector mysql with(
"connector.class" = 'io.debezium.connector.mysql.MySqlConnector',
"database.hostname" = 'mysql',
"tasks.max" = '1',
"database.port" = '3306',
"database.user" = 'debezium',
"database.password" = 'dbz',
"database.server.id" = '42',
"database.server.name" = 'after',
"table.whitelist" = 'deepprices.deepprices',
"database.history.kafka.bootstrap.servers" = 'kafka:29092',
"database.history.kafka.topic" = 'dbz.deepprices',
"include.schema.changes" = 'true',
"key.converter" = 'org.apache.kafka.connect.json.JsonConverter',
"key.converter.schemas.enable" = 'TRUE',
"value.converter" = 'org.apache.kafka.connect.json.JsonConverter',
"value.converter.schemas.enable" = 'TRUE',
"transforms" = 'unwrap,createkey',
"transforms.unwrap.type" = 'io.debezium.transforms.UnwrapFromEnvelope',
"transforms.createkey.type" = 'org.apache.kafka.connect.transforms.ValueToKey',
"transforms.createkey.fields" = 'id'
);
This causes the following error in my Kafka Connect log:
Caused by: org.apache.kafka.connect.errors.DataException: Field does not exist: id
at org.apache.kafka.connect.transforms.ValueToKey.applyWithSchema(ValueToKey.java:89)
at org.apache.kafka.connect.transforms.ValueToKey.apply(ValueToKey.java:67)
Changing the transformation type from UnwrapFromEnvelope to ExtractNewRecordState solved the issue on the Debezium MySQL CDC Connector, version 1.1.0:
"transforms.unwrap.type" = 'io.debezium.transforms.ExtractNewRecordState'
Since you're using ksqlDB here, you'll want to set your source connector to write the key as a string:
key.converter=org.apache.kafka.connect.storage.StringConverter

PostgresOperator in Airflow getting error while passing parameter

I have a DAG which queries the Postgres database using PostgresOperator.
However, when passing the parameter I am getting the error below.
psycopg2.ProgrammingError: column "132" does not exist
LINE 1: ...d,derived_tstamp FROM atomic.events WHERE event_name = "132"
A snapshot of my DAG is below:
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": airflow.utils.dates.days_ago(1),
    "email": ["airflow@airflow.com"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
}

dag = DAG("PostgresTest", default_args=default_args, schedule_interval='3,33 * * * *',
          template_searchpath=['/root/airflow/sql/'])
dailyOperator = PostgresOperator(
    task_id='Refresh_DailyScore',
    postgres_conn_id='postgress_sophi',
    params={"e_name": '"132"'},
    sql='atomTest.sql',
    dag=dag)
A snapshot of atomTest.sql:
SELECT domain_userid,derived_tstamp FROM atomic.events WHERE event_name = {{ params.e_name }}
I have been hitting my head against this the whole day trying to understand why Airflow treats the value 132 as a column.
Please suggest.
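For what it's worth, the rendered SQL in the error suggests the quoting is the culprit: in Postgres, double quotes denote an identifier (a column name) while single quotes denote a string literal, so "132" is parsed as a column. A minimal sketch of the change, keeping the question's task and connection names and assuming event_name is a text column:
# Pass the bare value and let the template quote it as a string literal.
dailyOperator = PostgresOperator(
    task_id='Refresh_DailyScore',
    postgres_conn_id='postgress_sophi',
    params={"e_name": "132"},  # no embedded double quotes
    sql='atomTest.sql',
    dag=dag)
and in atomTest.sql wrap the placeholder in single quotes: WHERE event_name = '{{ params.e_name }}'.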

AvroTypeException: Not an enum: MOBILE on DataFileWriter

I am getting the following error message when I try to write Avro records using the built-in AvroKeyValueSinkWriter in Flink 1.3.2 and Avro 1.8.2.
My schema looks like this:
{"namespace": "com.base.avro",
"type": "record",
"name": "Customer",
"doc": "v6",
"fields": [
{"name": "CustomerID", "type": "string"},
{"name": "platformAgent", "type": {
"type": "enum",
"name": "PlatformAgent",
"symbols": ["WEB", "MOBILE", "UNKNOWN"]
}, "default":"UNKNOWN"}
]
}
And I am calling the following Flink code to write data:
var properties = new util.HashMap[String, String]()
val stringSchema = Schema.create(Type.STRING)
val myTypeSchema = Customer.getClassSchema
val keySchema = stringSchema.toString
val valueSchema = myTypeSchema.toString
val compress = true
properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema)
properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema)
properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, compress.toString)
properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC)
val sink = new BucketingSink[org.apache.flink.api.java.tuple.Tuple2[String, Customer]]("s3://test/flink")
sink.setBucketer(new DateTimeBucketer("yyyy-MM-dd/HH/mm/"))
sink.setInactiveBucketThreshold(120000) // this is 2 minutes
sink.setBatchSize(1024 * 1024 * 64) // this is 64 MB,
sink.setPendingSuffix(".avro")
val writer = new AvroKeyValueSinkWriter[String, Customer](properties)
sink.setWriter(writer.duplicate())
However, it throws the following error:
Caused by: org.apache.avro.AvroTypeException: Not an enum: MOBILE
at org.apache.avro.generic.GenericDatumWriter.writeEnum(GenericDatumWriter.java:177)
at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:119)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:75)
at org.apache.avro.generic.GenericDatumWriter.writeField(GenericDatumWriter.java:166)
at org.apache.avro.generic.GenericDatumWriter.writeRecord(GenericDatumWriter.java:156)
at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:118)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:75)
at org.apache.avro.generic.GenericDatumWriter.writeField(GenericDatumWriter.java:166)
at org.apache.avro.generic.GenericDatumWriter.writeRecord(GenericDatumWriter.java:156)
at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:118)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:75)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:62)
at org.apache.avro.file.DataFileWriter.append(DataFileWriter.java:302)
... 10 more
Please suggest!
UPDATE 1:
It turns out this is a bug in Avro 1.8+, tracked in https://issues-test.apache.org/jira/browse/AVRO-1810. I had to override the Avro version Flink uses with dependencyOverrides += "org.apache.avro" % "avro" % "1.7.3".