Connecting pyspark cluster to Cassandra cluster ERROR o64.load - pyspark

I am trying to connect my PySpark cluster to a Cassandra cluster. I did the following to set up the connector from Spark to Cassandra:
./bin/spark-submit --packages com.datastax.spark:spark-cassandra-connector_2.10:1.5.0-M2 ./examples/testing.py
I set the following in my Python file:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
SPARK_IP = "ip-111-11-1-1.us-west-2.compute.internal"
SPARK_PORT = "7077"
CASSANDRA_PORT = "222.22.2.22"
conf = SparkConf() \
.setMaster("spark://%s:%s" % (SPARK_IP, SPARK_PORT)) \
.set("spark.cassandra.connection.host", CASSANDRA_PORT)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
In my Cassandra cluster I created a keyspace and a table. I then try to read from Cassandra in PySpark as follows:
sqlContext.read \
.format("org.apache.spark.sql.cassandra") \
.options(table="poop", keyspace="demo") \
.load().show()
I get the following error and I'm not sure how to fix this:
Traceback (most recent call last):
File "/usr/local/spark/examples/testing.py", line 37, in
.options(table="poop", keyspace="demo") \
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 155, in load
File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in call
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o64.load.
: java.lang.ClassNotFoundException: Failed to find data source: org.apache.spark.sql.cassandra. Please find packages at http://spark.apache.org/third-party-projects.html
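For context, this ClassNotFoundException usually means the connector package never reached the driver's classpath, for example because the script was launched without the --packages flag or with coordinates that do not match the cluster's Spark/Scala build. Below is a minimal sketch of declaring the package in the SparkConf itself so the dependency travels with the script; the master URL and host are the placeholders from the question, and whether spark.jars.packages is honoured this way on Spark 1.5 is an assumption to verify:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf() \
    .setMaster("spark://ip-111-11-1-1.us-west-2.compute.internal:7077") \
    .set("spark.jars.packages",
         "com.datastax.spark:spark-cassandra-connector_2.10:1.5.0-M2") \
    .set("spark.cassandra.connection.host", "222.22.2.22")

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Same read as in the question, now with the connector on the classpath.
sqlContext.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="poop", keyspace="demo") \
    .load().show()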

Related

Fill an empty postgres database in Pyspark

I would like to write a PySpark dataframe json_df into a completely empty Postgres database (no schema or table yet). I use the following code, but something is wrong with the table selected in the option of the write statement. The error points to a driver issue, but the driver is up to date, so I assume it is just my code that is wrong. Any help would be appreciated.
My code:
from pyspark.sql.types import StructType, StructField, StringType

database = "postgres"
jdbcUrl = f"jdbc:postgres://localhost:5432;databaseName={database}"
schema = StructType([
    StructField('first_column', StringType(), True),
    StructField('second_column', StringType(), True),
])
df = sqlContext.createDataFrame(sc.emptyRDD(),schema)
json_df.select("first_column","second_column").write.format("jdbc") \
.mode("overwrite") \
.option("url", jdbcUrl) \
.option("user", user) \
.option("dbtable", df) \
.save()
The error:
Traceback (most recent call last):
File "etl.py", line 98, in <module>
.option("dbtable", df) \
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/readwriter.py", line 825, in save
self._jwrite.save()
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o74.save.
: java.sql.SQLException: No suitable driver
Edit:
After changing the code to:
json_df.write.format("jdbc") \
.mode("overwrite") \
.option("url", jdbcUrl) \
.option("user", user) \
.option("dbtable", "news_schema.json_df") \
.save()
The error is:
File "etl.py", line 104, in <module>
.option("dbtable", "new_schema.json_df") \
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/readwriter.py", line 825, in save
self._jwrite.save()
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/ubuntu/.local/lib/python3.6/site-packages/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/home/ubuntu/.local/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o53.save.
: java.sql.SQLException: No suitable driver
According to https://jdbc.postgresql.org/documentation/head/connect.html, you should use postgresql instead of postgres in the URL, and there is no need to specify databaseName=:
jdbcUrl = f"jdbc:postgresql://localhost:5432/{database}"

Create pyspark dataframe from parquet file

I am quite new to PySpark and I am still trying to figure out how things work. What I am trying to do is load a parquet file into memory using pyarrow and then turn it into a PySpark dataframe, but I am getting an error.
I should mention that I am not reading directly through PySpark because the file is in S3, which gives me another error about "no filesystem for scheme s3",
so I am trying to work around it. Below is a reproducible example.
import pyarrow.parquet as pq
import s3fs
from pyspark import SparkContext
from pyspark.sql import SparkSession

s3 = s3fs.S3FileSystem()
parquet_file=pq.ParquetDataset('s3filepath.parquet',filesystem=s3)
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
spark.createDataFrame(parquet_file)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-20-0cb2dd287606> in <module>
----> 1 spark.createDataFrame(pandas_dataframe)

/usr/local/spark/python/pyspark/sql/session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
    746             rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
    747         else:
--> 748             rdd, schema = self._createFromLocal(map(prepare, data), schema)
    749         jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
    750         jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())

TypeError: 'ParquetDataset' object is not iterable
import pyspark
from pyspark.sql import SQLContext
sc = pyspark.SparkContext('local', "retail")
sqlC = SQLContext(sc)
This is how you should read parquet files into a Spark dataframe:
df = sqlC.read.parquet('path_to_file_or_dir')
You can read data from S3 via Spark as long as you have the access and secret keys for the S3 bucket. This would be more efficient than going through arrow via pandas and then converting to a Spark dataframe, because otherwise you would have to parallelize the serial read.
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", ACCESS_KEY)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", SECRET_KEY)
df = spark.read.parquet("s3://path/to/parquet/files")
source doc => https://docs.databricks.com/spark/latest/data-sources/aws/amazon-s3.html#access-aws-s3-directly
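For completeness, the pyarrow route from the question can also work if the ParquetDataset is materialized before it is handed to Spark. A minimal sketch, assuming the dataset fits in driver memory (it goes through pandas, which the answer above rightly flags as less efficient than reading in parallel):
import pyarrow.parquet as pq
import s3fs
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
s3 = s3fs.S3FileSystem()

# Read the dataset into an Arrow table, then into pandas, then into Spark.
dataset = pq.ParquetDataset('s3filepath.parquet', filesystem=s3)
pandas_df = dataset.read().to_pandas()
spark_df = spark.createDataFrame(pandas_df)
spark_df.show()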

Error reading/writing from Phoenix using PySpark

I am trying to put together a data pipeline on the HDP 2.6.3 sandbox (Docker). I am using PySpark with Phoenix (4.7) and HBase.
I have installed the Phoenix project from Maven and successfully created a table with test records. I can see the data in HBase as well.
Now I am trying to read data from the table using PySpark with the following code:
import phoenix
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext(appName="Phoenix test")
sqlContext = SQLContext(sc)
table = sqlContext.read.format("org.apache.phoenix.spark").option("table", "INPUT_TABLE").option("zkUrl", "localhost:2181:/hbase-unsecure").load()
Phoenix DDL:
CREATE TABLE INPUT_TABLE (id BIGINT NOT NULL PRIMARY KEY, col1 VARCHAR, col2 INTEGER);
UPSERT INTO INPUT_TABLE (id, col1, col2) VALUES (1, 'test_row_1',111);
UPSERT INTO INPUT_TABLE (id, col1, col2) VALUES (2, 'test_row_2',111 );
The spark-submit call:
spark-submit --class org.apache.phoenix.spark --jars /usr/hdp/current/phoenix-server/phoenix-4.7.0.2.5.0.0-1245-client.jar --repositories http://repo.hortonworks.com/content/groups/public/ --files /etc/spark2/conf/hbase-site.xml phoenix_test.py
Traceback (most recent call last):
File "/root/hdp/process_data.py", line 42, in
.format(data_source_format)\
File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 593, in save
File "/usr/lib/python2.6/site-packages/py4j-0.10.6-py2.6.egg/py4j/java_gateway.py", line 1160, in call
answer, self.gateway_client, self.target_id, self.name)
File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/lib/python2.6/site-packages/py4j-0.10.6-py2.6.egg/py4j/protocol.py", line 320, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o55.save.
: java.lang.UnsupportedOperationException: empty.tail
thanks,
clairvoyant
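Note that the traceback refers to o55.save, i.e. a write, while the snippet above only shows a read. For reference, a typical phoenix-spark read/write pair looks like the sketch below; OUTPUT_TABLE is a hypothetical table, and this illustrates the connector API in general rather than a confirmed fix for the empty.tail error:
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="Phoenix test")
sqlContext = SQLContext(sc)

# Read INPUT_TABLE (created by the DDL above) back through the connector ...
df = sqlContext.read.format("org.apache.phoenix.spark") \
    .option("table", "INPUT_TABLE") \
    .option("zkUrl", "localhost:2181:/hbase-unsecure") \
    .load()

# ... and write it out; phoenix-spark requires mode("overwrite") for saves.
df.write.format("org.apache.phoenix.spark") \
    .mode("overwrite") \
    .option("table", "OUTPUT_TABLE") \
    .option("zkUrl", "localhost:2181:/hbase-unsecure") \
    .save()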

Getting error while using collect_list function with struct datatype in Spark 1.6.0

While executing the statement below I am getting an error in Spark 1.6.0; the grouped_df statement is not working for me.
from pyspark.sql import functions as F
from pyspark.sql import SQLContext
data = [[1,'2014-01-03', 10],[1,'2014-01-04', 5],[1,'2014-01-05', 15],[1,'2014-01-06' , 20],[2,'2014-02-10', 100],[2,'2014-03-11', 500],[2,'2014-04-15',1500]]
df = sc.parallelize(data).toDF(['id','date','value'])
df.show()
grouped_df = df.groupby("id").agg(F.collect_list(F.struct("date", "value")).alias("list_col"))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/opt/taxgard/CPWorkArea/agarwal/python/spark/spark-1.6/python/pyspark/sql/group.py", line 91, in agg
_to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))
File "/opt/taxgard/CPWorkArea/agarwal/python/spark/spark-1.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
File "/opt/taxgard/CPWorkArea/agarwal/python/spark/spark-1.6/python/pyspark/sql/utils.py", line 51, in deco
raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: u'No handler for Hive udf class org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCollectList because: Only primitive type arguments are accepted but struct<date:string,value:bigint> was passed as parameter 1..;'
You have to use HiveContext instead of SQLContext
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName='my app name')
sql_cntx = HiveContext(sc)
data = [[1,'2014-01-03', 10],[1,'2014-01-04', 5],[1,'2014-01-05', 15],[1,'2014-01-06' , 20],[2,'2014-02-10', 100],[2,'2014-03-11', 500],[2,'2014-04-15',1500]]
rdd = sc.parallelize(data)
df = sql_cntx.createDataFrame(rdd, ['id','date','value'])
# ...
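With the HiveContext-backed DataFrame, the aggregation from the question should then resolve; a minimal sketch reusing the names defined above:
from pyspark.sql import functions as F

# collect_list over a struct is handled via Hive's UDAF support in Spark 1.6,
# which is why the HiveContext is needed here.
grouped_df = df.groupby("id").agg(
    F.collect_list(F.struct("date", "value")).alias("list_col"))
grouped_df.show()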

How to add jdbc drivers to classpath when using PySpark?

How / where do I install the JDBC drivers for Spark SQL? I'm running the all-spark-notebook Docker image and am trying to pull some data directly from a SQL database into Spark.
From what I can tell, I need to include the drivers in my classpath; I'm just not sure how to do that from PySpark.
from pyspark.sql import SparkSession
spark = SparkSession \
.builder \
.master("local") \
.appName("Python Spark SQL basic example") \
.getOrCreate()
jdbcDF = spark.read \
.format("jdbc") \
.option("url", "jdbc:postgresql:dbserver") \
.option("dbtable", "jdbc:postgresql:dbserver") \
.load()
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-2-f3b08ff6d117> in <module>()
2 spark = SparkSession .builder .master("local") .appName("Python Spark SQL basic example") .getOrCreate()
3
----> 4 jdbcDF = spark.read .format("jdbc") .option("url", "jdbc:postgresql:dbserver") .option("dbtable", "jdbc:postgresql:dbserver") .load()
/usr/local/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
163 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
164 else:
--> 165 return self._df(self._jreader.load())
166
167 #since(1.4)
/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o36.load.
: java.sql.SQLException: No suitable driver
at java.sql.DriverManager.getDriver(DriverManager.java:315)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:83)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:34)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:32)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:306)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
In order to include the driver for postgresql you can do the following:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()  # create the configuration
conf.set("spark.jars", "/path/to/postgresql-connector-java-someversion-bin.jar")  # set the spark.jars
...
# feed the configuration to the session here
spark = SparkSession.builder \
    .config(conf=conf) \
    .master("local") \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()
Now, since you are using Docker, I guess you have to mount the folder that has the driver jar and refer to the mounted folder. (e.g.: How to mount a host directory in a Docker container)
Hope this helps, good luck!
Edit: A different way would be to give the --driver-class-path argument when using spark-submit like this:
spark-submit --driver-class-path=path/to/postgresql-connector-java-someversion-bin.jar file_to_run.py
but I'm guessing this is not how you will run this.
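A third option, added here as a suggestion rather than part of the original answers, is to let Spark resolve the driver from Maven Central via spark.jars.packages; this works when the SparkSession has not yet been created in the notebook kernel. The coordinates and connection details below are illustrative:
from pyspark.sql import SparkSession

# Spark downloads the artifact and puts it on the driver and executor classpaths.
spark = SparkSession.builder \
    .master("local") \
    .appName("Python Spark SQL basic example") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.23") \
    .getOrCreate()

jdbcDF = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/dbserver") \
    .option("dbtable", "schema.tablename") \
    .option("user", "username") \
    .option("password", "password") \
    .load()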
Putting the driver into the pyspark path works, but the correct way to do it is to add something like this:
conf = pyspark.SparkConf().setAll([('spark.executor.id', 'driver'),
('spark.app.id', 'local-1631738601802'),
('spark.app.name', 'PySparkShell'),
('spark.driver.port', '32877'),
('spark.sql.warehouse.dir', 'file:/home/data_analysis_tool/spark-warehouse'),
('spark.driver.host', 'localhost'),
('spark.sql.catalogImplementation', 'hive'),
('spark.rdd.compress', 'True'),
('spark.driver.bindAddress', 'localhost'),
('spark.serializer.objectStreamReset', '100'),
('spark.master', 'local[*]'),
('spark.submit.pyFiles', ''),
('spark.app.startTime', '1631738600836'),
('spark.submit.deployMode', 'client'),
('spark.ui.showConsoleProgress', 'true'),
('spark.driver.extraClassPath','/tmp/postgresql-42.2.23.jar')])
note the line:
('spark.driver.extraClassPath','/tmp/postgresql-42.2.23.jar')
Here is the whole code:
import psycopg2
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from sqlalchemy import create_engine
import qgrid
#appName = "PySpark PostgreSQL Example - via psycopg2"
#master = "local"
#spark = SparkSession.builder.master(master).appName(appName).getOrCreate()
conf = pyspark.SparkConf().setAll([('spark.executor.id', 'driver'),
('spark.app.id', 'local-1631738601802'),
('spark.app.name', 'PySparkShell'),
('spark.driver.port', '32877'),
('spark.sql.warehouse.dir', 'file:/home/data_analysis_tool/spark-warehouse'),
('spark.driver.host', 'localhost'),
('spark.sql.catalogImplementation', 'hive'),
('spark.rdd.compress', 'True'),
('spark.driver.bindAddress', 'localhost'),
('spark.serializer.objectStreamReset', '100'),
('spark.master', 'local[*]'),
('spark.submit.pyFiles', ''),
('spark.app.startTime', '1631738600836'),
('spark.submit.deployMode', 'client'),
('spark.ui.showConsoleProgress', 'true'),
('spark.driver.extraClassPath','/tmp/postgresql-42.2.23.jar')])
sc = pyspark.SparkContext(conf=conf)
sc.getConf().getAll()
sparkSession = SparkSession(sc)
sparkDataFrame = sparkSession.read.format("jdbc") \
.options(
url="jdbc:postgresql://localhost:5432/Database",
dbtable="test_features_3",
user="database_user",
password="Pa$$word").load()
print (sparkDataFrame.count())
sc.stop()