Write PySpark 2.0.1 DataFrame as PostgreSQL table: UnicodeEncodeError - postgresql

I've got the following error when trying to write a spark DataFrame as a PostgreSQL table:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-121-159b38b4c333> in <module>()
7 "password":"112211234",
8 "driver":"org.postgresql.Driver",
----> 9 "client_encoding":"utf8"
10 }
11 )
/home/ec2-user/spark-2.0.1-bin-hadoop2.6/python/pyspark/sql/readwriter.pyc in jdbc(self, url, table, mode, properties)
760 for k in properties:
761 jprop.setProperty(k, properties[k])
--> 762 self._jwrite.mode(mode).jdbc(url, table, jprop)
763
764
/home/ec2-user/spark-2.0.1-bin-hadoop2.6/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/home/ec2-user/spark-2.0.1-bin-hadoop2.6/python/pyspark/sql/utils.pyc in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/home/ec2-user/spark-2.0.1-bin-hadoop2.6/python/lib/py4j-0.10.3-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
<type 'str'>: (<type 'exceptions.UnicodeEncodeError'>, UnicodeEncodeError('ascii', u'An error occurred while calling o3418.jdbc.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 105.0 failed 4 times, most recent failure: Lost task 5.3 in stage 105.0 (TID 1937, 10.0.0.52): org.postgresql.util.PSQLException: \u041f\u043e\u0434\u0441\u043e\u0435\u0434\u0438\u043d\u0435\u043d\u0438\u0435 \u043f\u043e \u0430\u0434\u0440\u0435\u0441\u0443 localhost:5432 \u043e\u0442\u043a\u043b\u043e\u043d\u0435\u043d\u043e. \u041f\u0440\u043e\u0432\u0435\u0440\u044c\u0442\u0435 \u0447\u0442\u043e \u0445\u043e\u0441\u0442 \u0438 \u043f\u043e\u0440\u0442 \u0443\u043a\u0430\u0437\u0430\u043d\u044b \u043f\u0440\u0430\u0432\u0438\u043b\u044c\u043d\u043e \u0438 \u0447\u0442\u043e postmaster \u043f\u0440\u0438\u043d\u0438\u043c\u0430\u0435\u0442 TCP/IP-\u043f\u043e\u0434\u0441\u043e\u0435\u0434\u0438\u043d\u0435\u043d\u0438\u044f.\n\tat org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:262)\n\tat org.postgresql.core.ConnectionFactory.openConnection(ConnectionFactory.java:52)\n\tat org.postgresql.jdbc.PgConnection.<init>(PgConnection.java:216)\n\tat org.postgresql.Driver.makeConnection(Driver.java:404)\n\tat org.postgresql.Driver.connect(Driver.java:272)\n\tat org.apache.spark.sql.execution.datasources.jdbc.DriverWrapper.connect(DriverWrapper.scala:45)
The DataFrame is the following:
from pyspark.sql import SQLContext, Row, DataFrame, SparkSession
from pyspark.sql.types import *
spark = SparkSession.builder.appName("test") \
.config("spark.some.config.option", "test") \
.getOrCreate()
fields = [
StructField("id", IntegerType(), True),
StructField("name", StringType(), True),
StructField("age", IntegerType(), True)
]
schema = StructType(fields)
test = spark.createDataFrame([
Row(id=1, name=u"a", age=34),
Row(id=2, name=u"b", age=25)
], schema)
test.show()
i.e. this one
+---+----+---+
| id|name|age|
+---+----+---+
| 1| a| 34|
| 2| b| 25|
+---+----+---+
To write it to PostgreSQL I use the code:
test.write.jdbc(
url="jdbc:postgresql://localhost:5432/db",
table="test",
mode="overwrite",
properties={
"user":"root",
"password":"12345",
"driver":"org.postgresql.Driver",
"client_encoding":"utf8"
}
)
But it generates the error shown above. I cannot find the reason for this exception.
Reading an existing table that was created using the postgres console works fine.
I would be grateful for any help.

Related

Error when connecting to oracle database in pyspark

This is my code when run in pyspark env(version spark 3.1.2):
jdbcDF = spark.read \
.format("jdbc") \
.option("url", "jdbc:oracle:thin:#10.0.1.1:1521/sbank") \
.option("dbtable", "sa.a") \
.option("user", "g") \
.option("password", "zxc") \
.option("driver", "oracle.jdbc.driver.OracleDriver") \
.load()
But it shows the error message below:
Py4JJavaError Traceback (most recent call last)
/tmp/ipykernel_29/4076487584.py in <module>
----> 1 jdbcDF = spark.read \
2 .format("jdbc") \
3 .option("url", "jdbc:oracle:thin:#10.0.1.1:1521/sbank") \
4 .option("dbtable", "sa.a") \
5 .option("user", "g") \
/usr/local/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
208 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
209 else:
--> 210 return self._df(self._jreader.load())
211
212 def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o137.load
Can anyone help me solve this? Thank you in advance.
I added ojdbc11.jar into the jars folder of Spark.

Keep getting the Py4JError when working with the pyspark dataframe

I have a docker container up and running in vs code. With pyspark I connect to a postgres database on my local machine:
from pyspark.sql import SparkSession
spark = SparkSession \
.builder \
.appName("Python Spark SQL basic example") \
.config("spark.jars", "/opt/spark/jars/postgresql-42.2.5.jar") \
.getOrCreate()
df = spark.read \
.format("jdbc") \
.option("url", "jdbc:postgresql://host.docker.internal:5432/postgres") \
.option("dbtable", "chicago_crime") \
.option("user", "postgres") \
.option("password", "postgres") \
.option("driver", "org.postgresql.Driver") \
.load()
type(df)
Output:
pyspark.sql.dataframe.DataFrame
Example code of what works:
df.printSchema()
df.select('ogc_fid').show() #(Raises a Py4JJavaError sometimes)
Example code of what does not work:
df.show(1) # Py4JJavaError and ConnectionRefusedError: [Errno 111] Connection refused
Output exceeds the size limit. Open the full output data in a text editor
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
[... skipping hidden 1 frame]
Cell In[2], line 1
----> 1 df.show(1)
File /usr/local/lib/python3.9/site-packages/pyspark/sql/dataframe.py:606, in DataFrame.show(self, n, truncate, vertical)
605 if isinstance(truncate, bool) and truncate:
--> 606 print(self._jdf.showString(n, 20, vertical))
607 else:
File /usr/local/lib/python3.9/site-packages/py4j/java_gateway.py:1321, in JavaMember.__call__(self, *args)
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
File /usr/local/lib/python3.9/site-packages/pyspark/sql/utils.py:190, in capture_sql_exception.<locals>.deco(*a, **kw)
189 try:
--> 190 return f(*a, **kw)
191 except Py4JJavaError as e:
File /usr/local/lib/python3.9/site-packages/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
325 if answer[1] == REFERENCE_TYPE:
...
--> 438 self.socket.connect((self.java_address, self.java_port))
439 self.stream = self.socket.makefile("rb")
440 self.is_connected = True
ConnectionRefusedError: [Errno 111] Connection refused
Anyone knows what this Py4JJavaError is? And how to overcome it?
PySpark is just a Wrapper around the actual implementation of Spark, which is written in Scala. Py4J enables you to communicate with the JVM process in Python.
That means the Py4JJavaError is only an abstraction, it tells you that the JVM process threw an Exception.
The real error is ConnectionRefusedError: [Errno 111] Connection refused.
I assume the error is caused while connecting to your Postgres instance.

Py4JJavaError: An error occurred while calling o10495.join. : java.lang.StackOverflowError

In an Azure Synapse notebook, after running quite a number of functions, I'm trying to do a semi join of two dataframes where DF1 has one column called ID and DF2 has five columns: ID, SID, Name, Term, Desc. Now the issue is that every time I start the session, I get this error. But when I run the code cell 5-6 times, it starts working. Not sure why this keeps happening.
df1 is a union of all distinct IDs from two other dataframes.
df2 = ogdata.select('SID', 'ID', 'Name', 'Term', 'Desc').distinct()
My Join:
df3 = df2.join(df1, ["uid"], "semi")
I've tried:
Changing it to a left join, and different join syntax where I do df1.id = df2.id, but I always get the error every time I start a session. Then when I run the cell 5-6 times, it works.
My error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-127-9d80a53> in <module>
1 ##teaching_data_current = teaching_data_current.join(uid_agg_course_teacher, teaching_data_current.uid == uid_agg_course_teacher.uid, "semi").drop(uid_agg_course_teacher.uid)
2 teaching_data_c = coursedata.select('SubjectID','uid').distinct()
----> 3 teaching_data_curr = teaching_data_c.join(uid_agg_course_teacher, ["uid"], "semi")
4 #teaching_data_curr = teaching_data_c.alias("t1").join(uid_agg_course_teacher.alias("t2"), teaching_data_c.uid==uid_agg_course_teacher.uid, "semi")
/opt/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py in join(self, other, on, how)
1337 on = self._jseq([])
1338 assert isinstance(how, str), "how should be a string"
-> 1339 jdf = self._jdf.join(other._jdf, on, how)
1340 return DataFrame(jdf, self.sql_ctx)
1341
~/cluster-env/env/lib/python3.8/site-packages/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
~/cluster-env/env/lib/python3.8/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o10428.join.
: java.lang.StackOverflowError
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$$anonfun$findAliases$1.applyOrElse(Analyzer.scala:1763)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$$anonfun$findAliases$1.applyOrElse(Analyzer.scala:1763)
at scala.PartialFunction.$anonfun$runWith$1$adapted(PartialFunction.scala:145)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.collect(TraversableLike.scala:359)
at scala.collection.TraversableLike.collect$(TraversableLike.scala:357)
at scala.collection.immutable.List.collect(List.scala:327)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$.findAliases(Analyzer.scala:1763)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$.collectConflictPlans$1(Analyzer.scala:1388)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$.$anonfun$dedupRight$10(Analyzer.scala:1464)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$.collectConflictPlans$1(Analyzer.scala:1464)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$.$anonfun$dedupRight$10(Analyzer.scala:1464)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$.collectConflictPlans$1(Analyzer.scala:1464)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$.$anonfun$dedupRight$10(Analyzer.scala:1464)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$.collectConflictPlans$1(Analyzer.scala:1464)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences$.$anonfun$dedupRight$10(Analyzer.scala:1464)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)

Is there a pyspark function to give me 2 decimal places on multiple dataframe columns?

I'm new to coding and am new to pyspark and python (by new I mean I am a student and am learning it).
I keep getting an error in my code and I can't figure out why. What I'm trying to do is get my code to give me output with 2 decimal places. Below is a sample of what I want my output to look like:
+------+--------+------+------+
|col_ID| f.name |bal | avg. |
+------+--------+------+------+
|1234 | Henry |350.45|400.32|
|3456 | Sam |75.12 | 50.60|
+------+--------+------+------+
But instead here's my code and here's the error I'm getting with it:
My Code:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col #import col function for column manipulation
#import pyspark.sql.functions as func
spark=SparkSession.builder.getOrCreate()
df = spark.read.csv("/user/cloudera/Default2_Data.csv", header = True, inferSchema = True) \
.withColumn("income",round(df["income"],2)) \
.withColumn("balance",func.col("balance").cast('Float'))
#df.select(col("income").alias("income")),
#col("balance").alias("balance"),
#func.round(df["income"],2).alias("income1")
df.show(15)
Output:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
/opt/cloudera/parcels/SPARK2-2.4.0.cloudera2-1.cdh5.13.3.p0.1041012/lib/spark2/python/pyspark/sql/utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
/opt/cloudera/parcels/SPARK2-2.4.0.cloudera2-1.cdh5.13.3.p0.1041012/lib/spark2/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
Py4JJavaError: An error occurred while calling o707.withColumn.
: org.apache.spark.sql.AnalysisException: Resolved attribute(s) income#1101 missing from student#1144,income#1146,default#1143,RecordID#1142,balance#1145 in operator !Project [RecordID#1142, default#1143, student#1144, balance#1145, round(income#1101, 2) AS income#1152]. Attribute(s) with the same name appear in the operation: income. Please check if the right attribute(s) are used.;;
!Project [RecordID#1142, default#1143, student#1144, balance#1145, round(income#1101, 2) AS income#1152]
+- Relation[RecordID#1142,default#1143,student#1144,balance#1145,income#1146] csv
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:42)
at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:95)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:326)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:85)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:85)
at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:95)
at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:108)
at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:105)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:201)
at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:105)
at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)
at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:78)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:3406)
at org.apache.spark.sql.Dataset.select(Dataset.scala:1334)
at org.apache.spark.sql.Dataset.withColumns(Dataset.scala:2252)
at org.apache.spark.sql.Dataset.withColumn(Dataset.scala:2219)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
During handling of the above exception, another exception occurred:
AnalysisException Traceback (most recent call last)
<ipython-input-102-13a967925c21> in <module>
1 df = spark.read.csv("/user/cloudera/Default2_Data.csv", header = True, inferSchema = True) \
----> 2 .withColumn("income",round(df["income"],2)) \
3 .withColumn("balance",func.col("balance").cast('Float'))
4 #df.select(col("income").alias("income")),
5 #col("balance").alias("balance"),
/opt/cloudera/parcels/SPARK2-2.4.0.cloudera2-1.cdh5.13.3.p0.1041012/lib/spark2/python/pyspark/sql/dataframe.py in withColumn(self, colName, col)
1987 """
1988 assert isinstance(col, Column), "col should be Column"
-> 1989 return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx)
1990
1991 #ignore_unicode_prefix
/opt/cloudera/parcels/SPARK2-2.4.0.cloudera2-1.cdh5.13.3.p0.1041012/lib/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/opt/cloudera/parcels/SPARK2-2.4.0.cloudera2-1.cdh5.13.3.p0.1041012/lib/spark2/python/pyspark/sql/utils.py in deco(*a, **kw)
67 e.java_exception.getStackTrace()))
68 if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
70 if s.startswith('org.apache.spark.sql.catalyst.analysis'):
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: 'Resolved attribute(s) income#1101 missing from student#1144,income#1146,default#1143,RecordID#1142,balance#1145 in operator !Project [RecordID#1142, default#1143, student#1144, balance#1145, round(income#1101, 2) AS income#1152]. Attribute(s) with the same name appear in the operation: income. Please check if the right attribute(s) are used.;;\n!Project [RecordID#1142, default#1143, student#1144, balance#1145, round(income#1101, 2) AS income#1152]\n+- Relation[RecordID#1142,default#1143,student#1144,balance#1145,income#1146] csv\n'
Replace the following line:
withColumn("income",round(df["income"],2))
with the following:
withColumn("income",round(col('income'),2))

pyspark: AnalysisException when joining two data frame

I have two data frame created from sparkSQL:
df1 = sqlContext.sql(""" ...""")
df2 = sqlContext.sql(""" ...""")
I tried to join these two data frame on the column my_id like below:
from pyspark.sql.functions import col
combined_df = df1.join(df2, col("df1.my_id") == col("df2.my_id"), 'inner')
Then I got the following error. Any idea what I missed? Thanks!
AnalysisException Traceback (most recent call last)
<ipython-input-11-45f5313387cc> in <module>()
3 from pyspark.sql.functions import col
4
----> 5 combined_df = df1.join(df2, col("df1.my_id") == col("df2.my_id"), 'inner')
6 combined_df.take(10)
/usr/local/spark-latest/python/pyspark/sql/dataframe.py in join(self, other, on, how)
770 how = "inner"
771 assert isinstance(how, basestring), "how should be basestring"
--> 772 jdf = self._jdf.join(other._jdf, on, how)
773 return DataFrame(jdf, self.sql_ctx)
774
/usr/local/spark-latest/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/usr/local/spark-latest/python/pyspark/sql/utils.py in deco(*a, **kw)
67 e.java_exception.getStackTrace()))
68 if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
70 if s.startswith('org.apache.spark.sql.catalyst.analysis'):
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: "cannot resolve '`df1.my_id`' given input columns: [...
I think the issue with your code is that you are trying to give "df1.my_id" as a column name instead of just col('my_id'). That is why the error says it cannot resolve df1.my_id given the input columns.
you can do this without importing col.
combined_df = df1.join(df2, df1.my_id == df2.my_id, 'inner')
Not sure about pyspark, but this should work if you have the same field name in both dataframes:
combineDf = df1.join(df2, 'my_id', 'outer')
Hope this helps!