Issue with the ClickHouse JDBC connector - PySpark

I wrote a function that writes a Spark DataFrame into a ClickHouse DB, like below:
def write_df_to_click_house(df, table_name, clickhouse_connection_obj):
    clickhouse_driver = "com.github.housepower.jdbc.ClickHouseDriver"
    batch_size = "200000"
    num_partitions = "10"
    jdbc_url = f"jdbc:clickhouse://**.**.**.**:**/**"
    logger.info(jdbc_url)
    df.write.mode("append").format("jdbc").option("driver", clickhouse_driver).option(
        "user", *****
    ).option("password", *****).option(
        "batchsize", batch_size
    ).option(
        "numPartitions", num_partitions
    ).jdbc(
        jdbc_url, table_name, "append"
    )
When this function runs and tries to load data into ClickHouse, I get this error:
Py4JJavaError: An error occurred while calling o450.jdbc.: com.github.housepower.exception.ClickHouseSQLException: RESPONSE_TABLES_STATUS_RESPONSE
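For reference, the same write can also be expressed through the options-based save() path rather than the .jdbc(...) shortcut. This is only a sketch of an equivalent call, not a fix for the RESPONSE_TABLES_STATUS_RESPONSE error itself; the host, port, database, user and password placeholders below stand in for the masked values above:

def write_df_to_click_house(df, table_name, user, password):
    # Placeholder connection details; substitute the real host, port and database.
    jdbc_url = "jdbc:clickhouse://<host>:<port>/<database>"
    (
        df.write.mode("append")
        .format("jdbc")
        .option("driver", "com.github.housepower.jdbc.ClickHouseDriver")
        .option("url", jdbc_url)
        .option("dbtable", table_name)
        .option("user", user)
        .option("password", password)
        .option("batchsize", "200000")
        .option("numPartitions", "10")
        .save()
    )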

Related

AWS GlueStudio RDS -> Redshift invalid timestamp format

I am trying to create an AWS Glue ETL job to move data from Aurora RDS to Redshift, but cannot work out how to get the timestamp fields mapped properly. All stages of the job show a valid preview of the expected data, but the job always fails with the following error:
py4j.protocol.Py4JJavaError: An error occurred while calling o179.pyWriteDynamicFrame.
: java.sql.SQLException:
Error (code 1206) while loading data into Redshift: "Invalid timestamp format or value [YYYY-MM-DD HH24:MI:SS]"
Table name: public.stage_table_ae89e9dffe974b649bbf4852e49a4b12
Column name: updated_at
Column type: timestamp(0)
Raw line: 1234,5341,1121,0,2022-01-06 16:29:55.000000000,2022-01-06 16:29:55.000000000,1,1,Suzy
Raw field value: 0
I have tried applying a date format to remove the microseconds, and I have tried forcing quotes around the date fields; nothing works.
Here is the generated script:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrameCollection
from awsglue.dynamicframe import DynamicFrame
from awsglue import DynamicFrame
# Script generated for node Custom transform
def CastIntsTransform(glueContext, dfc) -> DynamicFrameCollection:
    df = dfc.select(list(dfc.keys())[0])
    df_resolved = (
        df.resolveChoice(specs=[("id", "cast:bigint")])
        .resolveChoice(specs=[("user_id", "cast:bigint")])
        .resolveChoice(specs=[("connected_user_id", "cast:bigint")])
        .resolveChoice(specs=[("mg_id", "cast:bigint")])
        .resolveChoice(specs=[("access_level", "cast:tinyint")])
        .resolveChoice(specs=[("status", "cast:tinyint")])
    )
    return DynamicFrameCollection({"CustomTransform0": df_resolved}, glueContext)

def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node JDBC Connection
JDBCConnection_node1 = glueContext.create_dynamic_frame.from_catalog(
    database="ABC123",
    table_name="user_connections",
    transformation_ctx="JDBCConnection_node1",
)
# Script generated for node SQL
SqlQuery0 = """
select
id,
user_id,
connected_user_id,
COALESCE(mg_id, 0) mg_id,
created_at,
updated_at,
updated_at,
access_level,
status,
COALESCE(nickname, '') nickname
from
apiData
"""
SQL_node1647619002820 = sparkSqlQuery(
    glueContext,
    query=SqlQuery0,
    mapping={"apiData": JDBCConnection_node1},
    transformation_ctx="SQL_node1647619002820",
)
# Script generated for node Custom transform
Customtransform_node1647612655336 = CastIntsTransform(
    glueContext,
    DynamicFrameCollection(
        {"SQL_node1647619002820": SQL_node1647619002820}, glueContext
    ),
)
# Script generated for node Select From Collection
SelectFromCollection_node1647613332516 = SelectFromCollection.apply(
    dfc=Customtransform_node1647612655336,
    key=list(Customtransform_node1647612655336.keys())[0],
    transformation_ctx="SelectFromCollection_node1647613332516",
)
# Script generated for node ApplyMapping
ApplyMapping_node2 = ApplyMapping.apply(
    frame=SelectFromCollection_node1647613332516,
    mappings=[
        ("id", "bigint", "id", "bigint"),
        ("user_id", "bigint", "user_id", "bigint"),
        ("connected_user_id", "bigint", "connected_user_id", "bigint"),
        ("mg_id", "bigint", "mg_id", "bigint"),
        ("created_at", "timestamp", "created_at", "timestamp"),
        ("updated_at", "timestamp", "updated_at", "timestamp"),
        ("access_level", "tinyint", "access_level", "tinyint"),
        ("status", "tinyint", "status", "tinyint"),
        ("nickname", "varchar", "nickname", "varchar"),
    ],
    transformation_ctx="ApplyMapping_node2",
)
# Script generated for node Amazon Redshift
pre_query = "drop table if exists public.stage_table_cd5d65739d334453938f090ea1cb2d6e;create table public.stage_table_cd5d65739d334453938f090ea1cb2d6e as select * from public.test_user_connections where 1=2;"
post_query = "begin;delete from public.test_user_connections using public.stage_table_cd5d65739d334453938f090ea1cb2d6e where public.stage_table_cd5d65739d334453938f090ea1cb2d6e.id = public.test_user_connections.id; insert into public.test_user_connections select * from public.stage_table_cd5d65739d334453938f090ea1cb2d6e; drop table public.stage_table_cd5d65739d334453938f090ea1cb2d6e; end;"
AmazonRedshift_node1647612972417 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=ApplyMapping_node2,
    catalog_connection="ABC123",
    connection_options={
        "database": "test",
        "dbtable": "public.stage_table_cd5d65739d334453938f090ea1cb2d6e",
        "preactions": pre_query,
        "postactions": post_query,
    },
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="AmazonRedshift_node1647612972417",
)
job.commit()
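For what it's worth, the Redshift side of this load can also be passed extra COPY options through the write's connection options, for example a TIMEFORMAT hint, via the documented "extracopyoptions" key. The sketch below only shows where such an option would go; whether it resolves this particular job's error is not established here:

# Sketch only: hand an extra COPY option (a TIMEFORMAT hint) to the Redshift load.
# Not verified to fix the "Invalid timestamp format" error for this job.
AmazonRedshift_node1647612972417 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=ApplyMapping_node2,
    catalog_connection="ABC123",
    connection_options={
        "database": "test",
        "dbtable": "public.stage_table_cd5d65739d334453938f090ea1cb2d6e",
        "preactions": pre_query,
        "postactions": post_query,
        "extracopyoptions": "TIMEFORMAT 'auto'",
    },
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="AmazonRedshift_node1647612972417",
)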

Extract Embedded AWS Glue Connection Credentials Using Scala

I have a Glue job that reads directly from Redshift, and to do that, one has to provide connection credentials. I have created an embedded Glue connection and can extract the credentials with the following PySpark code. Is there a way to do this in Scala?
glue = boto3.client('glue', region_name='us-east-1')
response = glue.get_connection(
    Name='name-of-embedded-connection',
    HidePassword=False
)
table = spark.read.format(
    'com.databricks.spark.redshift'
).option(
    'url',
    'jdbc:redshift://prod.us-east-1.redshift.amazonaws.com:5439/db'
).option(
    'user',
    response['Connection']['ConnectionProperties']['USERNAME']
).option(
    'password',
    response['Connection']['ConnectionProperties']['PASSWORD']
).option(
    'dbtable',
    'db.table'
).option(
    'tempdir',
    's3://config/glue/temp/redshift/'
).option(
    'forward_spark_s3_credentials', 'true'
).load()
There is no Scala equivalent from AWS to issue this API call, but you can use Java SDK code inside Scala, as mentioned in this answer.
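A rough sketch of what that could look like with the AWS SDK for Java (v1) Glue client is below; the region and connection name are taken from the PySpark snippet above, and the USERNAME/PASSWORD property keys are assumed to be the same ones used there:

import com.amazonaws.services.glue.AWSGlueClientBuilder
import com.amazonaws.services.glue.model.GetConnectionRequest

// Build a Glue client in the same region as the connection.
val glue = AWSGlueClientBuilder.standard().withRegion("us-east-1").build()

// Fetch the embedded connection, asking Glue not to hide the password.
val request = new GetConnectionRequest()
  .withName("name-of-embedded-connection")
  .withHidePassword(false)
val props = glue.getConnection(request).getConnection().getConnectionProperties()

val username = props.get("USERNAME")
val password = props.get("PASSWORD")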
This is the Java SDK call for getConnection. If you don't want to do that, you can follow the approach below instead:
Create an AWS Glue Python shell job and retrieve the connection information there.
Once you have the values, call the other Scala Glue job with them as arguments from inside your Python shell job, as shown below:
glue = boto3.client('glue', region_name='us-east-1')
response = glue.get_connection(
    Name='name-of-embedded-connection',
    HidePassword=False
)
response = glue.start_job_run(
    JobName='my_scala_Job',
    Arguments={
        '--username': response['Connection']['ConnectionProperties']['USERNAME'],
        '--password': response['Connection']['ConnectionProperties']['PASSWORD']
    }
)
Then access these parameters inside your Scala job using getResolvedOptions, as shown below:
import com.amazonaws.services.glue.util.GlueArgParser
val args = GlueArgParser.getResolvedOptions(
  sysArgs,
  Array("username", "password")
)
val user = args("username")
val pwd = args("password")

Flink SQL CLI client CREATE TABLE from Kafka

I am trying to create a table in the Apache Flink SQL client. I want to filter my JSON data in Flink, which arrives continuously from a Kafka cluster.
The JSON looks like this:
{"lat":25.77,"lon":-80.19,"timezone":"America\/New_York",
"timezone_offset":-14400,
"current.dt":1592151550,
"current.sunrise":1592130546,
"current.sunset":1592179999,
"current.temp":302.77,
"current.feels_like":306.9,
"current.pressure":1017,
"current.humidity":78,
"current.dew_point":298.52,
"current.uvi":11.97,
"current.clouds":75,
"current.visibility":16093,
"current.wind_speed":3.6,
"current.wind_deg":60,
"current.weather.0.id":803,
"current.weather.0.main":"Clouds",
"current.weather.0.description":"broken clouds",
"current.weather.0.icon":"04d"}
The part I am interested in:
"current.weather.0.description":"broken clouds"
I want to filter my data whenever the current.weather description is "moderate rain". I tried to create two tables in Flink:
the Rain table, where the whole JSON arrives, and
the ProcessedRain table, where my filtered data will be stored and sent back to another Kafka cluster.
CREATE TABLE Rain (current.weather.0.description varchar) WITH (
  'connector.type' = 'kafka',
  'connector.version' = 'universal',
  'connector.topic' = 'WeatherRawData',
  'format.type' = 'json',
  'connector.properties.0.key' = 'bootstrap.servers',
  'connector.properties.0.value' = 'kafka:9092',
  'connector.properties.1.key' = 'group.id',
  'connector.properties.1.value' = 'flink-input-group',
  'connector.startup-mode' = 'earliest-offset'
);
CREATE TABLE ProcessedRain (
  current.weather.0.description varchar
) WITH (
  'connector.type' = 'kafka',
  'connector.version' = 'universal',
  'connector.topic' = 'WeatherProcessedData',
  'format.type' = 'json',
  'connector.properties.0.key' = 'bootstrap.servers',
  'connector.properties.0.value' = 'kafka:9092',
  'connector.properties.1.key' = 'group.id',
  'connector.properties.1.value' = 'flink-output-group'
);
The error message I get:
[ERROR] Could not execute SQL statement. Reason: org.apache.flink.table.api.SqlParserException: SQL parse failed. Encountered "current" at line 1, column 20. Was expecting one of:
"PRIMARY" ...
"UNIQUE" ...
"WATERMARK" ...
<BRACKET_QUOTED_IDENTIFIER> ...
<QUOTED_IDENTIFIER> ...
<BACK_QUOTED_IDENTIFIER> ...
<IDENTIFIER> ...
<UNICODE_QUOTED_IDENTIFIER> ...
How should my CREATE TABLE statements be written correctly?
I think it should be:
CREATE TABLE ProcessedRain (
  `current.weather.0.description` VARCHAR
) WITH (
  'connector.type' = 'kafka',
  'connector.version' = 'universal',
  'connector.topic' = 'WeatherProcessedData',
  'format.type' = 'json',
  'connector.properties.bootstrap.servers' = 'kafka:9092',
  'connector.properties.group.id' = 'flink-output-group'
);
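The Rain table would need the same back-quoting on its column name. With both tables declared that way, the filter described in the question could then be expressed roughly like this (a sketch, not tested against this setup):

INSERT INTO ProcessedRain
SELECT `current.weather.0.description`
FROM Rain
WHERE `current.weather.0.description` = 'moderate rain';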

Problem when using Spark SQL (2.1) to work with a PostgreSQL DB

I use the following test case to write data to a PostgreSQL table, and it works fine.
test("SparkSQLTest") {
val session = SparkSession.builder().master("local").appName("SparkSQLTest").getOrCreate()
val url = "jdbc:postgresql://dbhost:12345/db1"
val table = "schema1.table1"
val props = new Properties()
props.put("user", "user123")
props.put("password", "pass#123")
props.put(JDBCOptions.JDBC_DRIVER_CLASS, "org.postgresql.Driver")
session.range(300, 400).write.mode(SaveMode.Append).jdbc(url, table, props)
}
Then I use the following script with spark-sql -f sql_script_file.sql to write Hive data into a PostgreSQL table.
CREATE OR REPLACE TEMPORARY VIEW tmp_v1
USING org.apache.spark.sql.jdbc
OPTIONS (
  driver 'org.postgresql.Driver',
  url 'jdbc:postgresql://dbhost:12345/db1',
  dbtable 'schema1.table2',
  user 'user123',
  password 'pass#123',
  batchsize '2000'
);

insert into tmp_v1 select
  name,
  age
from test.person; -- test.person is the Hive db.table
But when I run the above script using spark-sql -f sql_script.sql, it complains that the PostgreSQL user/password is invalid; the exception is as follows. I think the two methods above are basically the same, so I would like to ask where the problem is. Thanks.
org.postgresql.util.PSQLException: FATAL: Invalid username/password,login denied.
at org.postgresql.core.v3.ConnectionFactoryImpl.doAuthentication(ConnectionFactoryImpl.java:375)
at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:189)
at org.postgresql.core.ConnectionFactory.openConnection(ConnectionFactory.java:64)
at org.postgresql.jdbc2.AbstractJdbc2Connection.<init>(AbstractJdbc2Connection.java:124)
at org.postgresql.jdbc3.AbstractJdbc3Connection.<init>(AbstractJdbc3Connection.java:28)
at org.postgresql.jdbc3g.AbstractJdbc3gConnection.<init>(AbstractJdbc3gConnection.java:20)
at org.postgresql.jdbc4.AbstractJdbc4Connection.<init>(AbstractJdbc4Connection.java:30)
at org.postgresql.jdbc4.Jdbc4Connection.<init>(Jdbc4Connection.java:22)
at org.postgresql.Driver.makeConnection(Driver.java:392)
at org.postgresql.Driver.connect(Driver.java:266)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:59)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:50)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:58)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:114)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:330)
at org.apache.spark.sql.execution.datasources.CreateTempViewUsing.run(ddl.scala:76)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:59)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:57)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:75)
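One way to narrow this down (purely a diagnostic sketch, not an explanation of the failure) is to open a bare JDBC connection with exactly the same url, user and password outside Spark, which tells you whether the credentials themselves are accepted by the server:

// Diagnostic sketch: verify the credentials from the OPTIONS block with plain JDBC.
import java.sql.DriverManager

Class.forName("org.postgresql.Driver")
val conn = DriverManager.getConnection(
  "jdbc:postgresql://dbhost:12345/db1", "user123", "pass#123")
println(conn.getMetaData.getDatabaseProductVersion)
conn.close()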

Exception in thread "main" java.sql.SQLSyntaxErrorException: ORA-00936: missing expression

While executing insert and update commands in Oracle 11g, I'm getting the below error.
val stmt = con.createStatement()
//Insert
val query1 = "insert into audit values('D','abc','T','01-NOV-18','Inprogress')"
stmt.executeUpdate(query1)
//Update
val query2 = "Update audit Set status='test' where where product= = 'D'"
stmt.executeUpdate(query2)
I'm getting the below error:
//Error while updating record
Exception in thread "main" java.sql.SQLSyntaxErrorException: ORA-00936: missing expression
at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:447)
at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:396)
at oracle.jdbc.driver.T4C8Oall.processError(T4C8Oall.java:951)
at oracle.jdbc.driver.T4CTTIfun.receive(T4CTTIfun.java:513)
at oracle.jdbc.driver.T4CTTIfun.doRPC(T4CTTIfun.java:227)
at oracle.jdbc.driver.T4C8Oall.doOALL(T4C8Oall.java:531)
at oracle.jdbc.driver.T4CStatement.doOall8(T4CStatement.java:195)
at oracle.jdbc.driver.T4CStatement.executeForRows(T4CStatement.java:1036)
at oracle.jdbc.driver.OracleStatement.doExecuteWithTimeout(OracleStatement.java:1336)
at oracle.jdbc.driver.OracleStatement.executeUpdateInternal(OracleStatement.java:1845)
at oracle.jdbc.driver.OracleStatement.executeUpdate(OracleStatement.java:1810)
at oracle.jdbc.driver.OracleStatementWrapper.executeUpdate(OracleStatementWrapper.java:294)
at com.oracle.OracleConnection$.main(OracleConnection.scala:21)
at com.oracle.OracleConnection.main(OracleConnection.scala)
Process finished with exit code 1
Any help will be appreciated. Thanks in advance.
You have two "where" keywords and two "=" signs in your update query. It should be:
val query2 = "Update audit Set status='test' where product='D'"