I have a Docker container up and running in VS Code. With PySpark I connect to a Postgres database on my local machine:
from pyspark.sql import SparkSession
spark = SparkSession \
.builder \
.appName("Python Spark SQL basic example") \
.config("spark.jars", "/opt/spark/jars/postgresql-42.2.5.jar") \
.getOrCreate()
df = spark.read \
.format("jdbc") \
.option("url", "jdbc:postgresql://host.docker.internal:5432/postgres") \
.option("dbtable", "chicago_crime") \
.option("user", "postgres") \
.option("password", "postgres") \
.option("driver", "org.postgresql.Driver") \
.load()
type(df)
Output:
pyspark.sql.dataframe.DataFrame
Example code of what works:
df.printSchema()
df.select('ogc_fid').show() #(Raises a Py4JJavaError sometimes)
Example code of what does not work:
df.show(1) # Py4JJavaError and ConnectionRefusedError: [Errno 111] Connection refused
Output exceeds the size limit. Open the full output data in a text editor
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
[... skipping hidden 1 frame]
Cell In[2], line 1
----> 1 df.show(1)
File /usr/local/lib/python3.9/site-packages/pyspark/sql/dataframe.py:606, in DataFrame.show(self, n, truncate, vertical)
605 if isinstance(truncate, bool) and truncate:
--> 606 print(self._jdf.showString(n, 20, vertical))
607 else:
File /usr/local/lib/python3.9/site-packages/py4j/java_gateway.py:1321, in JavaMember.__call__(self, *args)
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
File /usr/local/lib/python3.9/site-packages/pyspark/sql/utils.py:190, in capture_sql_exception.<locals>.deco(*a, **kw)
189 try:
--> 190 return f(*a, **kw)
191 except Py4JJavaError as e:
File /usr/local/lib/python3.9/site-packages/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
325 if answer[1] == REFERENCE_TYPE:
...
--> 438 self.socket.connect((self.java_address, self.java_port))
439 self.stream = self.socket.makefile("rb")
440 self.is_connected = True
ConnectionRefusedError: [Errno 111] Connection refused
Does anyone know what this Py4JJavaError is, and how to overcome it?
PySpark is just a wrapper around the actual implementation of Spark, which is written in Scala. Py4J enables you to communicate with the JVM process from Python.
That means the Py4JJavaError is only an abstraction: it tells you that the JVM process threw an exception.
The real error is ConnectionRefusedError: [Errno 111] Connection refused.
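I assume the error is caused while connecting to your Postgres instance. Note, however, that the last frame of the traceback is `self.socket.connect((self.java_address, self.java_port))`, i.e. the refused connection is the one Py4J opens to the Java gateway, so also check that the JVM process behind the Spark session is still running (for example, that it has not crashed or been killed).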
Related
This is the code I run in my PySpark environment (Spark version 3.1.2):
jdbcDF = spark.read \
.format("jdbc") \
.option("url", "jdbc:oracle:thin:#10.0.1.1:1521/sbank") \
.option("dbtable", "sa.a") \
.option("user", "g") \
.option("password", "zxc") \
.option("driver", "oracle.jdbc.driver.OracleDriver") \
.load()
But it fails with the error below:
Py4JJavaError Traceback (most recent call last)
/tmp/ipykernel_29/4076487584.py in <module>
----> 1 jdbcDF = spark.read \
2 .format("jdbc") \
3 .option("url", "jdbc:oracle:thin:#10.0.1.1:1521/sbank") \
4 .option("dbtable", "sa.a") \
5 .option("user", "g") \
/usr/local/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
208 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
209 else:
--> 210 return self._df(self._jreader.load())
211
212 def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o137.load
Can anyone help me to solve that? Thank you in advance.
I have added ojdbc11.jar to the jars folder of Spark.
I have deployed pyspark 3.0.1 in Kubernetes.
I am using koalas in a jupyter notebook in order to perform some transformations and I need to write and read from Azure Database for PostgreSQL.
I can read it from pandas using the following code:
from sqlalchemy import create_engine
import psycopg2
import pandas
uri = 'postgres+psycopg2://<postgreuser>:<postgrepassword>#<server>:5432/<database>'
engine_azure = create_engine(uri, echo=False)
df = pdf.read_sql_query(f"select * from public.<table>", con=engine_azure)
I want to read this table from Pyspark using this code:
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import databricks.koalas as ks
from s3fs import S3FileSystem
import datetime
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3,org.postgresql:postgresql:42.1.1 pyspark-shell pyspark-shell"
os.environ['PYSPARK_SUBMIT_ARGS2'] = "--packages org.postgresql:postgresql:42.1.1 pyspark-shell"
sparkClassPath = os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.postgresql:postgresql:42.1.1 pyspark-shell'
# Create Spark config for our Kubernetes based cluster manager
sparkConf = SparkConf()
sparkConf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")
sparkConf.setAppName("spark")
sparkConf.set("spark.kubernetes.container.image", "<image>")
sparkConf.set("spark.kubernetes.namespace", "spark")
sparkConf.set("spark.executor.instances", "3")
sparkConf.set("spark.executor.cores", "2")
sparkConf.set("spark.driver.memory", "2000m")
sparkConf.set("spark.executor.memory", "2000m")
sparkConf.set("spark.kubernetes.pyspark.pythonVersion", "3")
sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
sparkConf.set("spark.kubernetes.authenticate.serviceAccountName", "spark")
sparkConf.set("spark.driver.port", "29414")
sparkConf.set("spark.driver.host", "<deployment>.svc.cluster.local")
sparkConf.set("spark.driver.extraClassPath", sparkClassPath)
# Initialize our Spark cluster, this will actually
# generate the worker nodes.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext
df3 = spark.read \
.format("jdbc") \
.option("url", "jdbc:postgresql://<host>:5432/<database>") \
.option("driver", "org.postgresql.Driver") \
.option("dbtable", "select * from public.<table>") \
.option("user", "<user>") \
.option("password", "<password>") \
.load()
But I receive this error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-5-a529178ed9a0> in <module>
1 url = 'jdbc:postgresql://psql-mcf-prod1.postgres.database.azure.com:5342/cpke-prod'
2 properties = {'user': 'adminmcfpsql#psql-mcf-prod1.postgres.database.azure.com', 'password': '4vb44B^V8w2D*q!eQZgl',"driver": "org.postgresql.Driver"}
----> 3 df3 = spark.read.jdbc(url=url, table='select * from public.userinput_write_offs where reversed_date is NULL', properties=properties)
/usr/local/spark/python/pyspark/sql/readwriter.py in jdbc(self, url, table, column, lowerBound, upperBound, numPartitions, predicates, properties)
629 jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
630 return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
--> 631 return self._df(self._jreader.jdbc(url, table, jprop))
632
633
/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
/usr/local/lib/python3.7/dist-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o89.jdbc.
: org.postgresql.util.PSQLException: The connection attempt failed.
at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:275)
at org.postgresql.core.ConnectionFactory.openConnection(ConnectionFactory.java:49)
at org.postgresql.jdbc.PgConnection.<init>(PgConnection.java:194)
at org.postgresql.Driver.makeConnection(Driver.java:450)
at org.postgresql.Driver.connect(Driver.java:252)
at org.apache.spark.sql.execution.datasources.jdbc.DriverWrapper.connect(DriverWrapper.scala:45)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$createConnectionFactory$1(JdbcUtils.scala:64)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:56)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation$.getSchema(JDBCRelation.scala:226)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:35)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:344)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:286)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:221)
at org.apache.spark.sql.DataFrameReader.jdbc(DataFrameReader.scala:312)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: connect timed out
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:607)
at org.postgresql.core.PGStream.<init>(PGStream.java:68)
at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:144)
... 27 more
The port number in the URL shown in your traceback is incorrect - it should be 5432, not 5342. That is why your connection timed out. If you change the line
.option("url", "jdbc:postgresql://<host>:5342/<database>")
to
.option("url", "jdbc:postgresql://<host>:5432/<database>")
maybe it will solve your problem.
I would like to write a PySpark DataFrame, json_df, into a completely empty Postgres database (no schemas and no tables). I use the following code, but there seems to be something wrong with the table given in the dbtable option of the write statement. The error points to a driver issue, but my driver is up to date, so I assume it is my code that is wrong. Any help would be appreciated.
My code:
database = "postgres"
jdbcUrl = f"jdbc:postgres://localhost:5432;databaseName={database}"
schema = StructType([
StructField('first_column', StringType(), True),
StructField('second_columns', StringType(), True),
])
df = sqlContext.createDataFrame(sc.emptyRDD(),schema)
json_df.select("first_column","second_column").write.format("jdbc") \
.mode("overwrite") \
.option("url", jdbcUrl) \
.option("user", user) \
.option("dbtable", df) \
.save()
The error:
Traceback (most recent call last):
File "etl.py", line 98, in <module>
.option("dbtable", df) \
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/readwriter.py", line 825, in save
self._jwrite.save()
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o74.save.
: java.sql.SQLException: No suitable driver
Edit:
After changing the code to:
json_df.write.format("jdbc") \
.mode("overwrite") \
.option("url", jdbcUrl) \
.option("user", user) \
.option("dbtable", "news_schema.json_df") \
.save()
The error is:
File "etl.py", line 104, in <module>
.option("dbtable", "new_schema.json_df") \
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/readwriter.py", line 825, in save
self._jwrite.save()
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o53.save.
: java.sql.SQLException: No suitable driver
According to https://jdbc.postgresql.org/documentation/head/connect.html, you should use postgresql instead of postgres in the URL, and there is no need to specify databaseName= because the database name goes in the URL path:
jdbcUrl = f"jdbc:postgresql://localhost:5432/{database}"
I was trying to connect to MongoDB Atlas from PySpark and I have the following problem:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
sc = SparkContext
spark = SparkSession.builder \
.config("spark.mongodb.input.uri", "mongodb+srv://#USER#:#PASS##test00-la3lt.mongodb.net/db.BUSQUEDAS?retryWrites=true") \
.config("spark.mongodb.output.uri", "mongodb+srv://#USER#:#PASS##test00-la3lt.mongodb.net/db.BUSQUEDAS?retryWrites=true") \
.getOrCreate()
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
The error that this code returns is:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-3-346df2de8d22> in <module>()
----> 1 df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
c:\users\andres\appdata\local\programs\python\python36\lib\site-packages\pyspark\sql\readwriter.py in load(self, path, format, schema, **options)
170 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
171 else:
--> 172 return self._df(self._jreader.load())
173
174 #since(1.4)
c:\users\andres\appdata\local\programs\python\python36\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
c:\users\andres\appdata\local\programs\python\python36\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
c:\users\andres\appdata\local\programs\python\python36\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o34.load.
: java.lang.NoClassDefFoundError: com/mongodb/client/model/Collation
at com.mongodb.spark.config.ReadConfig$.<init>(ReadConfig.scala:50)
at com.mongodb.spark.config.ReadConfig$.<clinit>(ReadConfig.scala)
at com.mongodb.spark.sql.DefaultSource.createRelation(DefaultSource.scala:67)
at com.mongodb.spark.sql.DefaultSource.createRelation(DefaultSource.scala:50)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:340)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:164)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.mongodb.client.model.Collation
How can I solve this problem?
Is it a problem with the code or with the references?
In the pyspark config file, I have this:
./bin/pyspark --conf "spark.mongodb.input.uri=mongodb+srv://#USER#:#PASS##test00-la3lt.mongodb.net/db.BUSQUEDAS?readPreference=primaryPreferred" \
--conf "spark.mongodb.output.uri=mongodb+srv://#USER#:#PASS##test00-la3lt.mongodb.net/db.BUSQUEDAS" \
--packages org.mongodb.spark:mongo-spark-connector_2.11:2.1.3
The version of Spark is 2.3.1 and Scala 2.11.8
This error occurs because it is necessary to add the following references (the MongoDB Java driver, driver core and BSON libraries):
https://oss.sonatype.org/content/repositories/releases/org/mongodb/mongodb-driver/3.8.1/
https://oss.sonatype.org/content/repositories/releases/org/mongodb/mongodb-driver-core/3.8.1/
https://oss.sonatype.org/content/repositories/releases/org/mongodb/bson/3.8.1/
After adding these, the problem was solved.
How / where do I install the jdbc drivers for spark sql? I'm running the all-spark-notebook docker image, and am trying to pull some data directly from a sql database into spark.
From what I can tell, I need to include the drivers in my classpath; I'm just not sure how to do that from PySpark.
from pyspark.sql import SparkSession
spark = SparkSession \
.builder \
.master("local") \
.appName("Python Spark SQL basic example") \
.getOrCreate()
jdbcDF = spark.read \
.format("jdbc") \
.option("url", "jdbc:postgresql:dbserver") \
.option("dbtable", "jdbc:postgresql:dbserver") \
.load()
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-2-f3b08ff6d117> in <module>()
2 spark = SparkSession .builder .master("local") .appName("Python Spark SQL basic example") .getOrCreate()
3
----> 4 jdbcDF = spark.read .format("jdbc") .option("url", "jdbc:postgresql:dbserver") .option("dbtable", "jdbc:postgresql:dbserver") .load()
/usr/local/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
163 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
164 else:
--> 165 return self._df(self._jreader.load())
166
167 #since(1.4)
/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o36.load.
: java.sql.SQLException: No suitable driver
at java.sql.DriverManager.getDriver(DriverManager.java:315)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:83)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:34)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:32)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:306)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
In order to include the driver for postgresql you can do the following:
from pyspark.conf import SparkConf
conf = SparkConf() # create the configuration
conf.set("spark.jars", "/path/to/postgresql-connector-java-someversion-bin.jar") # set the spark.jars
...
spark = SparkSession.builder \
.config(conf=conf) \ # feed it to the session here
.master("local") \
.appName("Python Spark SQL basic example") \
.getOrCreate()
Now, since you are using Docker, I guess you have to mount the folder that has the driver jar and refer to the mounted folder. (e.g.: How to mount a host directory in a Docker container)
Hope this helps, good luck!
Edit: A different way would be to pass the --driver-class-path argument when using spark-submit, like this:
spark-submit --driver-class-path=path/to/postgresql-connector-java-someversion-bin.jar file_to_run.py
but I'm guessing this is not how you will run this.
Putting the driver into the PySpark path works, but the correct way to do it is to add something like this:
conf = pyspark.SparkConf().setAll([('spark.executor.id', 'driver'),
('spark.app.id', 'local-1631738601802'),
('spark.app.name', 'PySparkShell'),
('spark.driver.port', '32877'),
('spark.sql.warehouse.dir', 'file:/home/data_analysis_tool/spark-warehouse'),
('spark.driver.host', 'localhost'),
('spark.sql.catalogImplementation', 'hive'),
('spark.rdd.compress', 'True'),
('spark.driver.bindAddress', 'localhost'),
('spark.serializer.objectStreamReset', '100'),
('spark.master', 'local[*]'),
('spark.submit.pyFiles', ''),
('spark.app.startTime', '1631738600836'),
('spark.submit.deployMode', 'client'),
('spark.ui.showConsoleProgress', 'true'),
('spark.driver.extraClassPath','/tmp/postgresql-42.2.23.jar')])
note the line:
('spark.driver.extraClassPath','/tmp/postgresql-42.2.23.jar')
Here is the whole code:
import psycopg2
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from sqlalchemy import create_engine
import qgrid
#appName = "PySpark PostgreSQL Example - via psycopg2"
#master = "local"
#spark = SparkSession.builder.master(master).appName(appName).getOrCreate()
conf = pyspark.SparkConf().setAll([('spark.executor.id', 'driver'),
('spark.app.id', 'local-1631738601802'),
('spark.app.name', 'PySparkShell'),
('spark.driver.port', '32877'),
('spark.sql.warehouse.dir', 'file:/home/data_analysis_tool/spark-warehouse'),
('spark.driver.host', 'localhost'),
('spark.sql.catalogImplementation', 'hive'),
('spark.rdd.compress', 'True'),
('spark.driver.bindAddress', 'localhost'),
('spark.serializer.objectStreamReset', '100'),
('spark.master', 'local[*]'),
('spark.submit.pyFiles', ''),
('spark.app.startTime', '1631738600836'),
('spark.submit.deployMode', 'client'),
('spark.ui.showConsoleProgress', 'true'),
('spark.driver.extraClassPath','/tmp/postgresql-42.2.23.jar')])
sc = pyspark.SparkContext(conf=conf)
sc.getConf().getAll()
sparkSession = SparkSession (sc)
sparkDataFrame = sparkSession.read.format("jdbc") \
.options(
url="jdbc:postgresql://localhost:5432/Database",
dbtable="test_features_3",
user="database_user",
password="Pa$$word").load()
print (sparkDataFrame.count())
sc.stop()