PySpark to connect to PostgreSQL

I'm trying to connect to a PostgreSQL database from Databricks and I'm using the following code:
from pyspark.sql import SparkSession
spark = SparkSession.builder.config("spark.jars", "path_to_postgresDriver/postgresql_42_2_14.jar").getOrCreate()
driver = "org.postgresql.Driver"
database_host = "my_host"
database_port = "5432"
database_name = "db_name"
table = "db_table"
user = "my_user"
password = "my_pwd"
url = f"jdbc:postgresql://{database_host}:{database_port}/{database_name}"
remote_table = (spark.read
    .format("jdbc")
    .option("driver", driver)
    .option("url", url)
    .option("dbtable", table)
    .option("user", user)
    .option("password", password)
    .load()
)
Unfortunately I get back the following error:
Any idea what might be happening?
Thank you in advance!
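For reference, a minimal sketch of the same read that pulls the PostgreSQL driver from Maven coordinates instead of pointing at a local jar path; the coordinate org.postgresql:postgresql:42.2.14 and all connection values below are assumptions, substitute your own:
from pyspark.sql import SparkSession

# Sketch only: fetch the JDBC driver via Maven coordinates rather than a local jar.
# The coordinate and connection values are placeholders, not taken from the question.
spark = (SparkSession.builder
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.14")
    .getOrCreate())

url = "jdbc:postgresql://my_host:5432/db_name"

remote_table = (spark.read
    .format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", url)
    .option("dbtable", "db_table")
    .option("user", "my_user")
    .option("password", "my_pwd")
    .load())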

Related

PySpark publish to Google Pub/Sub Lite

The code is from the Google doc:
https://cloud.google.com/pubsub/lite/docs/write-messages-apache-spark
I'm trying to publish to a Pub/Sub Lite topic from PySpark.
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, create_map, col, lit, when
from pyspark.sql.types import BinaryType, StringType
import uuid
# TODO(developer):
project_number = xxx
location = "us-central1"
topic_id = "kosmin"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.google.cloud:pubsublite-spark-sql-streaming:0.4.1,com.google.cloud:google-cloud-pubsublite:1.6.1 pyspark-shell'
spark = SparkSession.builder.appName("write-app").getOrCreate()
# The original snippet omits how sdf is created; a streaming DataFrame with
# 'timestamp' and 'value' columns is needed, e.g. the built-in rate source
# used in the linked Google example (assumed here).
sdf = spark.readStream.format("rate").option("rampUpTime", "10s").load()

sdf = (
    sdf.withColumn("key", lit("example").cast(BinaryType()))
    .withColumn("data", col("value").cast(StringType()).cast(BinaryType()))
    .withColumnRenamed("timestamp", "event_timestamp")
    # Populate the attributes field. For example, an even value will
    # have {"key1", [b"even"]}.
    .withColumn(
        "attributes",
        create_map(
            lit("key1"),
            array(when(col("value") % 2 == 0, b"even").otherwise(b"odd")),
        ),
    )
    .drop("value")
)
query = (
    sdf.writeStream.format("pubsublite")
    .option(
        "pubsublite.topic",
        f"projects/{project_number}/locations/{location}/topics/{topic_id}",
    )
    # Required. Use a unique checkpoint location for each job.
    .option("checkpointLocation", "/tmp/app" + uuid.uuid4().hex)
    .outputMode("append")
    # .trigger(processingTime="1 second")
    .start()
)

# Wait 60 seconds to terminate the query.
query.awaitTermination(60)
query.stop()
but I'm getting:
22/07/29 19:09:38 ERROR Utils: Aborting task
com.google.api.gax.rpc.ApiException:
at com.google.cloud.pubsublite.internal.CheckedApiException.<init>(CheckedApiException.java:51)
at com.google.cloud.pubsublite.internal.CheckedApiException.<init>(CheckedApiException.java:55)
at com.google.cloud.pubsublite.internal.ExtractStatus.toCanonical(ExtractStatus.java:53)
at com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions.newServiceClient(PslWriteDataSourceOptions.java:131)
......
Caused by: java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;CLjava/lang/Object;)V
at io.grpc.Metadata$Key.validateName(Metadata.java:754)
at io.grpc.Metadata$Key.<init>(Metadata.java:762)
at io.grpc.Metadata$Key.<init>(Metadata.java:671)
at io.grpc.Metadata$AsciiKey.<init>(Metadata.java:971)
at io.grpc.Metadata$AsciiKey.<init>(Metadata.java:966)
at io.grpc.Metadata$Key.of(Metadata.java:708)
at io.grpc.Metadata$Key.of(Metadata.java:704)
at com.google.api.gax.grpc.GrpcHeaderInterceptor.<init>(GrpcHeaderInterceptor.java:60)
....
What am I missing? The credentials are set; maybe some package is missing at submit time?
Update: it worked with
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars=pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar pyspark-shell'
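Putting that workaround together, a sketch of the working setup (assuming the pre-built with-dependencies jar has been downloaded locally and is on the path given; the env var must be set before the SparkSession is created):
import os
from pyspark.sql import SparkSession

# The with-dependencies jar bundles a consistent guava/grpc, which appears to
# avoid the NoSuchMethodError above. The jar path/name is an assumption; use
# the jar you actually downloaded.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--jars=pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar pyspark-shell'
)
spark = SparkSession.builder.appName("write-app").getOrCreate()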

Spark stream stops abruptly - "the specified path does not exist"

I am working on Spark Structured Streaming. My stream works fine, but after some time it just stops because of the issue below.
Any suggestion on what the reason could be and how to resolve this issue?
java.io.FileNotFoundException: Operation failed: "The specified path does not exist.", 404, GET, https://XXXXXXXX.dfs.core.windows.net/output?upn=false&resource=filesystem&maxResults=5000&directory=XXXXXXXX&timeout=90&recursive=true, PathNotFound, "The specified path does not exist. RequestId:d1b7c77f-e01f-0027-7f09-4646f7000000 Time:2022-04-01T20:47:30.1791444Z"
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.checkException(AzureBlobFileSystem.java:1290)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.listKeysWithPrefix(AzureBlobFileSystem.java:530)
at com.databricks.tahoe.store.EnhancedAzureBlobFileSystemUpgrade.listKeysWithPrefix(EnhancedFileSystem.scala:605)
at com.databricks.tahoe.store.EnhancedDatabricksFileSystemV2.$anonfun$listKeysWithPrefix$1(EnhancedFileSystem.scala:374)
at com.databricks.backend.daemon.data.client.DBFSV2.$anonfun$listKeysWithPrefix$1(DatabricksFileSystemV2.scala:247)
at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:395)
at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:484)
at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:504)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:266)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:261)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:258)
at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.withAttributionContext(DatabricksFileSystemV2.scala:510)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:305)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:297)
at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.withAttributionTags(DatabricksFileSystemV2.scala:510)
at com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:479)
at com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:404)
at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.recordOperationWithResultTags(DatabricksFileSystemV2.scala:510)
at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:395)
at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:367)
at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.recordOperation(DatabricksFileSystemV2.scala:510)
at com.databricks.backend.daemon.data.client.DBFSV2.listKeysWithPrefix(DatabricksFileSystemV2.scala:240)
at com.databricks.tahoe.store.EnhancedDatabricksFileSystemV2.listKeysWithPrefix(EnhancedFileSystem.scala:374)
at com.databricks.tahoe.store.AzureLogStore.listKeysWithPrefix(AzureLogStore.scala:54)
at com.databricks.tahoe.store.DelegatingLogStore.listKeysWithPrefix(DelegatingLogStore.scala:251)
at com.databricks.sql.fileNotification.autoIngest.FileEventBackfiller$.listFiles(FileEventWorkerThread.scala:967)
at com.databricks.sql.fileNotification.autoIngest.FileEventBackfiller.runInternal(FileEventWorkerThread.scala:876)
at com.databricks.sql.fileNotification.autoIngest.FileEventBackfiller.run(FileEventWorkerThread.scala:809)
Caused by: Operation failed: "The specified path does not exist.", 404, GET, https://XXXXXXXXXX.dfs.core.windows.net/output?upn=false&resource=filesystem&maxResults=5000&directory=XXXXXXXX&timeout=90&recursive=true, PathNotFound, "The specified path does not exist. RequestId:02ae07cf-901f-0001-080e-46dd43000000 Time:2022-04-01T21:21:40.2136657Z"
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.services.AbfsRestOperation.execute(AbfsRestOperation.java:241)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.services.AbfsClient.listPath(AbfsClient.java:235)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.listFiles(AzureBlobFileSystemStore.java:1112)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.access$200(AzureBlobFileSystemStore.java:143)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore$1.fetchMoreResults(AzureBlobFileSystemStore.java:1052)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore$1.(AzureBlobFileSystemStore.java:1033)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.listKeysWithPrefix(AzureBlobFileSystemStore.java:1029)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.listKeysWithPrefix(AzureBlobFileSystem.java:527)
... 27 more
Below is my code:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StringType
from pyspark.sql import functions as F
from delta.tables import *
spark.sql("set spark.sql.files.ignoreMissingFiles=true")
filteredRawDF = ""
try:
    filteredRawDF = spark.readStream.format("cloudFiles") \
        .option("cloudFiles.format", "json") \
        .option("cloudFiles.schemaLocation", landingcheckPointFilePath) \
        .option("cloudFiles.inferColumnTypes", "true") \
        .load(landingFilePath) \
        .select(from_json('body', schema).alias('temp')) \
        .select(explode("temp.report.data").alias("details")) \
        .select("details",
                explode("details.breakdown").alias("inner_breakdown")) \
        .select("details", "inner_breakdown",
                explode("inner_breakdown.breakdown").alias("outer_breakdown")) \
        .select(to_timestamp(col("details.name"), "yyyy-MM-dd'T'HH:mm:ss+SSSS").alias('datetime'),
                col("details.year"),
                col("details.day"),
                col("details.hour"),
                col("details.minute"),
                col("inner_breakdown.name").alias("hotelName"),
                col("outer_breakdown.name").alias("checkindate"),
                col("outer_breakdown.counts")[0].cast("int").alias("HdpHits"))
except Exception as e:
    print(e)

query = filteredRawDF \
    .writeStream \
    .format("delta") \
    .option("mergeSchema", "true") \
    .outputMode("append") \
    .option("checkpointLocation", checkPointPath) \
    .trigger(processingTime='50 seconds') \
    .start(savePath)
Thanks
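Not an answer to the 404 itself, but since the stream dies abruptly, a hedged sketch of surfacing the failure in the driver logs using plain PySpark APIs (query is the handle started in the code above):
from pyspark.sql.utils import StreamingQueryException

# Block on the query so a termination error is caught and logged explicitly
# instead of the stream just stopping silently.
try:
    query.awaitTermination()
except StreamingQueryException as e:
    print("Stream terminated with error:", e)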

Mongodb connection url not working in AWS Glue Connection screen

I am trying to connect to MongoDB from the AWS Glue connection screen and getting an error.
I tried various URLs:
mongodb://server1:2066,server2:2066,server3:2066/dbname.collection?authMechanism=PLAIN&authSource=&external
mongodb://server1:2066,server2:2066,server3:2066/?authMechanism=PLAIN&authSource=&external
Error in the AWS Glue connection output in CloudWatch:
Check that your connection definition references your Mongo database with correct URL syntax, username, and password.
Exiting with error code 30
It works in a Glue job with the same VPC/subnet/security groups:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext, SparkConf
from awsglue.context import GlueContext
from awsglue.job import Job
import time
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
mongo_uri = "mongodb://_SERVER_1:2066,_SERVER_2:2066,_SERVER_3:2066/?authMechanism=PLAIN&authSource=$external"
read_mongo_options = {
    "uri": mongo_uri,
    "database": "_DB_",
    # "authMechanism": "PLAIN",
    # "authSource": "$external",
    "collection": "_collection_",
    "username": "****",
    "password": "***",
    "partitioner": "MongoSamplePartitioner",
    "partitionerOptions.partitionSizeMB": "10",
    "partitionerOptions.partitionKey": "_id",
    "inferSchema": "false"
}
logger = glueContext.get_logger()
logger.info("Connecting...")
s3_location = 's3://_bucket_/output/'
dynamic_frame = glueContext.create_dynamic_frame.from_options(
    connection_type="mongodb",
    connection_options=read_mongo_options)
print("Connected!")

How to convert this code to scala (using adal to generate Azure AD token)

I am currently working on Scala code that can establish a connection to a SQL Server database using an Azure AD token.
There is too little documentation on the subject online, so I tried to work it out using Python. Now that it is working, I am looking to convert my code to Scala.
Here is the Python script:
import adal  # missing from the original snippet

context = adal.AuthenticationContext(AUTHORITY_URL)
token = context.acquire_token_with_client_credentials("https://database.windows.net/", CLIENT_ID, CLIENT_SECRET)
access_token = token["accessToken"]

df = spark.read \
    .format("com.microsoft.sqlserver.jdbc.spark") \
    .option("url", URL) \
    .option("dbtable", "tab1") \
    .option("accessToken", access_token) \
    .option("hostNameInCertificate", "*.database.windows.net") \
    .load()
df.show()
Here is the Java code that you can use as a base, using the acquireToken function:
import com.microsoft.aad.adal4j.AuthenticationContext;
import com.microsoft.aad.adal4j.AuthenticationResult;
import com.microsoft.aad.adal4j.ClientCredential;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
...
String authority = "https://login.microsoftonline.com/<org-uuid>";
ExecutorService service = Executors.newFixedThreadPool(1);
AuthenticationContext context = new AuthenticationContext(authority, true, service);
ClientCredential credential = new ClientCredential("sp-client-id", "sp-client-secret");
AuthenticationResult result = context.acquireToken("resource_id", credential, null).get();
// get the token
String token = result.getAccessToken();
P.S. But really, ADAL isn't recommended anymore; it's better to use MSAL instead (here is the migration guide).
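For completeness, a hedged Python sketch of the same token acquisition with msal, the recommended replacement; the class and method names come from the msal package, while the tenant/client IDs below are placeholders:
import msal

# Placeholders: TENANT_ID, CLIENT_ID, CLIENT_SECRET must come from your AAD app registration.
app = msal.ConfidentialClientApplication(
    CLIENT_ID,
    authority=f"https://login.microsoftonline.com/{TENANT_ID}",
    client_credential=CLIENT_SECRET,
)
# ".default" requests the resource-level scope for Azure SQL Database.
result = app.acquire_token_for_client(scopes=["https://database.windows.net/.default"])
access_token = result["access_token"]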

Problem when using Spark SQL (2.1) to work with a PostgreSQL DB

I use the following test case to write data to a PostgreSQL table, and it works fine.
test("SparkSQLTest") {
val session = SparkSession.builder().master("local").appName("SparkSQLTest").getOrCreate()
val url = "jdbc:postgresql://dbhost:12345/db1"
val table = "schema1.table1"
val props = new Properties()
props.put("user", "user123")
props.put("password", "pass#123")
props.put(JDBCOptions.JDBC_DRIVER_CLASS, "org.postgresql.Driver")
session.range(300, 400).write.mode(SaveMode.Append).jdbc(url, table, props)
}
Then I run spark-sql -f sql_script_file.sql with the following script to write Hive data into the PostgreSQL table.
CREATE OR REPLACE TEMPORARY VIEW tmp_v1
USING org.apache.spark.sql.jdbc
OPTIONS (
driver 'org.postgresql.Driver',
url 'jdbc:postgresql://dbhost:12345/db1',
dbtable 'schema1.table2',
user 'user123',
password 'pass#123',
batchsize '2000'
);
insert into tmp_v1 select
name,
age
from test.person; -- test.person is the Hive db.table
But when I run the above script with spark-sql -f sql_script.sql, it complains that the PostgreSQL user/password is invalid; the exception is shown below. I think the two methods are basically the same, so I would ask where the problem is. Thanks.
org.postgresql.util.PSQLException: FATAL: Invalid username/password,login denied.
at org.postgresql.core.v3.ConnectionFactoryImpl.doAuthentication(ConnectionFactoryImpl.java:375)
at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:189)
at org.postgresql.core.ConnectionFactory.openConnection(ConnectionFactory.java:64)
at org.postgresql.jdbc2.AbstractJdbc2Connection.<init>(AbstractJdbc2Connection.java:124)
at org.postgresql.jdbc3.AbstractJdbc3Connection.<init>(AbstractJdbc3Connection.java:28)
at org.postgresql.jdbc3g.AbstractJdbc3gConnection.<init>(AbstractJdbc3gConnection.java:20)
at org.postgresql.jdbc4.AbstractJdbc4Connection.<init>(AbstractJdbc4Connection.java:30)
at org.postgresql.jdbc4.Jdbc4Connection.<init>(Jdbc4Connection.java:22)
at org.postgresql.Driver.makeConnection(Driver.java:392)
at org.postgresql.Driver.connect(Driver.java:266)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:59)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:50)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:58)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:114)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:330)
at org.apache.spark.sql.execution.datasources.CreateTempViewUsing.run(ddl.scala:76)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:59)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:57)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:75)