PySpark SQL for complex SQL with != NULL & NOT IN - pyspark

I have a Spark DataFrame, created like this:
sc = CassandraSparkContext(conf=conf)
sql = SQLContext(sc)
log = sc.cassandraTable("test","log_a")\
.select("m_date","userid","fsa","fsid").toDF()
sql.registerDataFrameAsTable(log, "log")
I can easily query a range on m_date like this:
query_str = ("select * from log where m_date >= %s and m_date < %s" %(1497052766,1498059766))
temp=sql.sql(query_str)
temp.show()
Everything is OK with that simple query, but I run into an issue with this more complex one:
query_str = "select * from log "\
"where userid != NULL "\
"or fsa not in ("\
"select fsa from log where userid is not null)"
query_str = query_str+ ("and m_date > %s and m_date < %s" %(1497052766,1498059766))
temp=sql.sql(query_str)
And I get this error:
Py4JJavaError Traceback (most recent call last)
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
Py4JJavaError: An error occurred while calling o25.sql.
: org.apache.spark.sql.AnalysisException: Null-aware predicate sub-queries cannot be used in nested conditions: (NOT (userid#1 = null) || ((NOT fsa#2 IN (list#62 []) && (m_date#0L > cast(1497052766 as bigint))) && (m_date#0L < cast(1498059766 as bigint))));;
Project [m_date#0L, userid#1, fsa#2, fsid#3]
+- Filter (NOT (userid#1 = null) || ((NOT fsa#2 IN (list#62 []) && (m_date#0L > cast(1497052766 as bigint))) && (m_date#0L < cast(1498059766 as bigint))))
: +- Project [fsa#2]
: +- Filter isnotnull(userid#1)
: +- SubqueryAlias log
: +- LogicalRDD [m_date#0L, userid#1, fsa#2, fsid#3]
+- SubqueryAlias log
+- LogicalRDD [m_date#0L, userid#1, fsa#2, fsid#3]
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:39)
at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:91)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:207)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:78)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:78)
at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:91)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:52)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:67)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:632)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Unknown Source)
During handling of the above exception, another exception occurred:
AnalysisException Traceback (most recent call last)
E:\FPT\project-spark-streaming\spark-calculate-newuser-daily.py in <module>()
76 "select fsa from log where userid is not null)"
77 query_str=query_str+ ("and m_date > %s and m_date < %s" %(1497052766,1498059766))
---> 78 temp=sql.sql(query_str)
79 pass
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\context.py in sql(self, sqlQuery)
382 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
383 """
--> 384 return self.sparkSession.sql(sqlQuery)
385
386 #since(1.0)
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\session.py in sql(self, sqlQuery)
601 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
602 """
--> 603 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
604
605 #since(2.0)
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\utils.py in deco(*a, **kw)
67 e.java_exception.getStackTrace()))
68 if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
70 if s.startswith('org.apache.spark.sql.catalyst.analysis'):
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: 'Null-aware predicate sub-queries cannot be used in nested conditions: (NOT (userid#1 = null) || ((NOT fsa#2 IN (list#62 []) && (m_date#0L > cast(1497052766 as bigint))) && (m_date#0L < cast(1498059766 as bigint))));;\nProject [m_date#0L, userid#1, fsa#2, fsid#3]\n+- Filter (NOT (userid#1 = null) || ((NOT fsa#2 IN (list#62 []) && (m_date#0L > cast(1497052766 as bigint))) && (m_date#0L < cast(1498059766 as bigint))))\n : +- Project [fsa#2]\n : +- Filter isnotnull(userid#1)\n :
+- SubqueryAlias log\n : +- LogicalRDD [m_date#0L, userid#1, fsa#2, fsid#3]\n +- SubqueryAlias log\n +- LogicalRDD [m_date#0L, userid#1, fsa#2, fsid#3]\n'
17/12/24 20:53:17 WARN SparkEnv: Exception while deleting Spark temp dir: C:\Users\hptphuong\AppData\Local\Temp\spark-c9fd644d-de1a-47c9-9e19-cbd0b01df138\userFiles-412a0e89-c56f-4897-98e7-05cd6114855f
java.io.IOException: Failed to delete: C:\Users\hptphuong\AppData\Local\Temp\spark-c9fd644d-de1a-47c9-9e19-cbd0b01df138\userFiles-412a0e89-c56f-4897-98e7-05cd6114855f
at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1031)
at org.apache.spark.SparkEnv.stop(SparkEnv.scala:103)
at org.apache.spark.SparkContext$$anonfun$stop$11.apply$mcV$sp(SparkContext.scala:1944)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1317)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1943)
at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:581)
at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1948)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
17/12/24 20:53:17 ERROR ShutdownHookManager: Exception while deleting Spark temp dir: C:\Users\hptphuong\AppData\Local\Temp\spark-c9fd644d-de1a-47c9-9e19-cbd0b01df138\userFiles-412a0e89-c56f-4897-98e7-05cd6114855f
If I keep only one of the two conditions of the OR, everything is fine, but when I combine them this error appears. I also tried replacing the OR with a union of two tables; that works, but it looks ridiculous.
Please show me how to fix it.
Thanks a lot.
@AKSW: sorry for the lack of information about the issue. I have updated my question. Please help me.

One obvious error I see in your code is the != NULL comparison. When you want to check if something is or is not null, you should use IS NULL or IS NOT NULL respectively.
Another issue I see is not grouping the conditions using round brackets, but I'll assume that you know what you're doing with the logic.
I would suggest rewriting the query as follows and see if it works for you:
query_str = '''
SELECT *
FROM log
WHERE (m_date > {0} AND m_date < {1})
  AND (userid IS NOT NULL
       OR fsa NOT IN (
           SELECT fsa FROM log WHERE userid IS NOT NULL
       )
  )'''.format(1497052766, 1498059766)
temp = sql.sql(query_str)
However, I should add a note (as already made in the comments above) that SQL support in Spark is not complete, and whether this works really depends on your Spark version and on whether the columns you're using in the query are nullable. If it doesn't work, you'll have to write separate queries and combine them based on your logic, as sketched below.
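A minimal sketch of that "separate queries" idea with the DataFrame API, reusing the log DataFrame from the question (this sidesteps the null-aware subquery restriction by replacing NOT IN with a left join; it ignores SQL's three-valued NOT IN semantics around NULL fsa values, so verify it matches your intent and Spark version):
from pyspark.sql import functions as F

# Date range shared by both conditions.
ranged = log.filter((F.col("m_date") > 1497052766) & (F.col("m_date") < 1498059766))

# fsa values that ever occur together with a non-null userid (the NOT IN subquery).
known_fsa = (log.filter(F.col("userid").isNotNull())
                .select(F.col("fsa").alias("known_fsa"))
                .distinct())

# Left join keeps every row of "ranged" and marks whether its fsa is "known".
flagged = ranged.join(known_fsa, ranged["fsa"] == known_fsa["known_fsa"], "left")

# userid IS NOT NULL  OR  fsa not found among the known ones.
temp = (flagged.filter(F.col("userid").isNotNull() | F.col("known_fsa").isNull())
               .drop("known_fsa"))
temp.show()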

NOT IN (subquery) has some limitations in Spark 2.0 (see this).
You can still use EXISTS / NOT EXISTS.
PS: Please specify your Spark version in order to help others having the same issue.
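For example, a NOT EXISTS rewrite of the original query might look like the sketch below (grouping the conditions with parentheses and keeping the date range from the question; whether Spark accepts an EXISTS subquery inside an OR still depends on the version, so treat this as something to try rather than a guaranteed fix):
query_str = '''
SELECT *
FROM log t
WHERE (t.m_date > {0} AND t.m_date < {1})
  AND (t.userid IS NOT NULL
       OR NOT EXISTS (
           SELECT 1 FROM log s
           WHERE s.userid IS NOT NULL AND s.fsa = t.fsa
       )
  )'''.format(1497052766, 1498059766)
temp = sql.sql(query_str)
temp.show()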

Related

How do I set "for fetch only" when querying IBM DB2 using the JDBC driver from Spark?

I have some code to query a db2 database that works if I don't include "for fetch only," but returns an error if I do. I was wondering if it's already being done, or how I could set it.
connection_url = f"jdbc:db2://{host}:{port}/{database}:user={username};password={password};"
df = (spark
      .read
      .format("jdbc")
      .option("driver", "com.ibm.db2.jcc.DB2Driver")
      .option("url", connection_url)
      .option("query", query)
      .load())
return df
Error when I include for fetch only:
com.ibm.db2.jcc.am.SqlSyntaxErrorException: DB2 SQL Error: SQLCODE=-104, SQLSTATE=42601, SQLERRMC=for;
and the detailed traceback is:
/databricks/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
162 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
163 else:
--> 164 return self._df(self._jreader.load())
165
166 def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 def deco(*a, **kw):
116 try:
--> 117 return f(*a, **kw)
118 except py4j.protocol.Py4JJavaError as e:
119 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o4192.load.
: com.ibm.db2.jcc.am.SqlSyntaxErrorException: DB2 SQL Error: SQLCODE=-104, SQLSTATE=42601, SQLERRMC=for;
;), DRIVER=4.25.13
at com.ibm.db2.jcc.am.b6.a(b6.java:810)
at com.ibm.db2.jcc.am.b6.a(b6.java:66)
at com.ibm.db2.jcc.am.b6.a(b6.java:140)
at com.ibm.db2.jcc.am.k3.c(k3.java:2824)
at com.ibm.db2.jcc.am.k3.d(k3.java:2808)
at com.ibm.db2.jcc.am.k3.a(k3.java:2234)
at com.ibm.db2.jcc.am.k4.a(k4.java:8242)
at com.ibm.db2.jcc.t4.ab.i(ab.java:206)
at com.ibm.db2.jcc.t4.ab.b(ab.java:96)
at com.ibm.db2.jcc.t4.p.a(p.java:32)
at com.ibm.db2.jcc.t4.av.i(av.java:150)
at com.ibm.db2.jcc.am.k3.al(k3.java:2203)
at com.ibm.db2.jcc.am.k4.bq(k4.java:3730)
at com.ibm.db2.jcc.am.k4.a(k4.java:4609)
at com.ibm.db2.jcc.am.k4.b(k4.java:4182)
at com.ibm.db2.jcc.am.k4.bd(k4.java:780)
at com.ibm.db2.jcc.am.k4.executeQuery(k4.java:745)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.getQueryOutputSchema(JDBCRDD.scala:68)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:58)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation$.getSchema(JDBCRelation.scala:241)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:36)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:385)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:356)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:323)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:323)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:222)
at sun.reflect.GeneratedMethodAccessor704.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:750)
I've searched IBM's documentation and Stack Overflow using every permutation I can think of.
I've read documentation about setting the isolation level, since I also get a failure when running queries with WITH UR, and was thinking that if I could find out why that fails I'd understand why FOR FETCH ONLY fails (there's an answer here), but it makes things clear as mud because I couldn't use it to find an analogous solution for FOR FETCH ONLY.
I've looked at the DB2 documentation on IBM's website and searched Stack Overflow, but this is eluding me.
edit: queries that run and don't run
Runs in dbvisualizer and pyspark
select
id_number
from
myschema.mytable
FETCH FIRST
10 ROWS ONLY
another one
select
id_number
from
myschema.mytable
Runs in dbvisualizer but not in pyspark
select
id_number
from
myschema.mytable
FETCH FIRST
10 ROWS ONLY FOR FETCH ONLY
another one
select
id_number
from
myschema.mytable
FOR FETCH ONLY
edit 2:
an example is that I run this code:
connection_url = f"jdbc:db2://{host}:{port}/{database}:user={username};password={password};"
df = (spark
      .read
      .format("jdbc")
      .option("driver", "com.ibm.db2.jcc.DB2Driver")
      .option("url", connection_url)
      .option("query", """
          select
              id_number
          from
              myschema.mytable
          FOR FETCH ONLY
          """)
      .load())
return df
and it doesn't work. Then I run this code:
connection_url = f"jdbc:db2://{host}:{port}/{database}:user={username};password={password};"
df = (spark
      .read
      .format("jdbc")
      .option("driver", "com.ibm.db2.jcc.DB2Driver")
      .option("url", connection_url)
      .option("query", """
          select
              id_number
          from
              myschema.mytable
          -- FOR FETCH ONLY
          """)
      .load())
return df
and it does work. I then went into DbVisualizer and verified that both versions of the query run there, so it's not a syntax error as far as I can tell.
DbVisualizer says the database major version is 12 and the minor version is 1, and I believe it's z/OS. I'm using JDBC driver version 4.25.13 in both pyspark and DbVisualizer, downloaded from Maven here.
edit 3:
this query runs fine in DbVisualizer, but fails in pyspark:
select
id_number
from
myschema.mytable
FOR READ ONLY
Alright, I found out what's happening. tl;dr: Spark already does it.
The documentation here states:
A query that will be used to read data into Spark. The specified query will be parenthesized and used as a subquery in the FROM clause. Spark will also assign an alias to the subquery clause. As an example, spark will issue a query of the following form to the JDBC Source.
SELECT <columns> FROM (<user_specified_query>) spark_gen_alias
I'm fairly certain the relevant code is here:
val sqlText = options.prepareQuery +
s"SELECT $columnList FROM ${options.tableOrQuery} $myTableSampleClause" +
s" $myWhereClause $getGroupByClause $getOrderByClause $myLimitClause $myOffsetClause"
So FOR FETCH ONLY falls within the subquery, which is not allowed in DB2.
Fortunately though, it looks like the CONCUR_READ_ONLY JDBC option is set, which is equivalent to FOR READ ONLY per the documentation here:
JDBC setting               | Db2® cursor setting                                   | IBM Informix® cursor setting
CONCUR_READ_ONLY           | FOR READ ONLY                                         | FOR READ ONLY
CONCUR_UPDATABLE           | FOR UPDATE                                            | FOR UPDATE
HOLD_CURSORS_OVER_COMMIT   | WITH HOLD                                             | WITH HOLD
TYPE_FORWARD_ONLY          | SCROLL not specified                                  | SCROLL not specified
TYPE_SCROLL_INSENSITIVE    | INSENSITIVE SCROLL                                    | SCROLL
TYPE_SCROLL_SENSITIVE      | SENSITIVE STATIC, SENSITIVE DYNAMIC, or ASENSITIVE, depending on the cursorSensitivity Connection and DataSource property | Not supported
The relevant code in spark is:
stmt = conn.prepareStatement(sqlText,
ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)
from here
As a side note, it looks like even if it weren't specified explicitly in the code above, CONCUR_READ_ONLY is the default flag for a ResultSet in java.sql:
Concurrency                 | Description
ResultSet.CONCUR_READ_ONLY  | Creates a read-only result set. This is the default.
ResultSet.CONCUR_UPDATABLE  | Creates an updateable result set.
source
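In practice, then, the clause can simply be dropped. A minimal sketch of the read without it (same connection options as in the question), relying on the read-only, forward-only statement Spark already prepares:
connection_url = f"jdbc:db2://{host}:{port}/{database}:user={username};password={password};"
df = (spark
      .read
      .format("jdbc")
      .option("driver", "com.ibm.db2.jcc.DB2Driver")
      .option("url", connection_url)
      # No FOR FETCH ONLY / FOR READ ONLY here: Spark wraps the query in a
      # subquery and already requests CONCUR_READ_ONLY on the prepared statement.
      .option("query", "select id_number from myschema.mytable")
      .load())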

polars: cannot connect to postgresql using pl.read_sql

According to the doc at https://pola-rs.github.io/polars-book/user-guide/howcani/io/read_db.html:
import polars as pl
conn = "postgres://username:password#server:port/database"
query = "SELECT * FROM foo"
pl.read_sql(query, conn)
---------------------------------------------------------------------------
PanicException Traceback (most recent call last)
Input In [120], in <module>
1 import connectorx
----> 2 df = pl.read_sql(query, conn)
File e:\bokeh\venv\lib\site-packages\polars\io.py:969, in read_sql(sql, connection_uri, partition_on, partition_range, partition_num, protocol)
903 """
904 Read a SQL query into a DataFrame.
905 Make sure to install connectorx>=0.2
(...)
966
967 """
968 if _WITH_CX:
--> 969 tbl = cx.read_sql(
970 conn=connection_uri,
971 query=sql,
972 return_type="arrow",
973 partition_on=partition_on,
974 partition_range=partition_range,
975 partition_num=partition_num,
976 protocol=protocol,
977 )
978 return from_arrow(tbl) # type: ignore[return-value]
979 else:
File e:\bokeh\venv\lib\site-packages\connectorx\__init__.py:151, in read_sql(conn, query, return_type, protocol, partition_on, partition_range, partition_num, index_col)
148 except ModuleNotFoundError:
149 raise ValueError("You need to install pyarrow first")
--> 151 result = _read_sql(
152 conn,
153 "arrow" if return_type in {"arrow", "polars"} else "arrow2",
154 queries=queries,
155 protocol=protocol,
156 partition_query=partition_query,
157 )
158 df = reconstruct_arrow(result)
159 if return_type == "polars":
PanicException: called `Result::unwrap()` on an `Err` value: Error { kind: ConfigParse, cause: Some("unexpected EOF") }
PostgreSQL is 14.2, the OS is Windows 10.
The PostgreSQL server is running fine; I can run psql database username in cmd.
According to the doc, I pip installed pyarrow and connectorx.

Visibility of temporary tables and database tables in Spark SQL: is it possible to make a nested query to a temporary table from a usual JDBC query?

I have a DataFrame registered as a temporary table:
val dailySummariesDfVisualize =
dailySummariesDf
.orderBy("event_time").registerTempTable("raw")
I can do some extraction from it with Spark SQL:
val df = sqlContext.sql("SELECT * FROM raw")
df.show()
And the output works. Then I'd like to do a nested query against the temporary table inside the JDBC database query, like this:
val dailySensorData =
getDFFromJdbcSource(SparkSession.builder().appName("test").master("local").getOrCreate(),
s"SELECT * FROM values WHERE time in (SELECT event_time FROM raw) limit 1000000")
.persist(StorageLevel.MEMORY_ONLY_SER)
dailySensorData.show(400, false)
And here I get the exception:
org.postgresql.util.PSQLException: ERROR: relation "raw" does not exist
If I try to execute it inside sqlContext.sql() like this:
val df = sqlContext.sql("SELECT * FROM values WHERE time in (SELECT event_time FROM raw)")
df.show()
I get:
org.apache.spark.sql.AnalysisException: Table or view not found: values; line 1 pos 14;
'Project [*]
+- 'Filter 'time IN (list#4967 [])
: +- 'Project ['event_time]
: +- 'UnresolvedRelation [raw]
+- 'UnresolvedRelation [values]
at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$1(CheckAnalysis.scala:106)
as if both values (the real JDBC table) and raw (the temporary table) were not visible from it. How can I use the temp table in nested queries?
UPD
Following mazaneicha's suggestion, I have tried the following (retrieving all values here, since I am not able to restrict them with the nested query):
val dailySummariesDfVisualize =
dailySummariesDf
.orderBy("event_time").createOrReplaceTempView("raw")
val dailySensorData =
getDFFromJdbcSource(SparkSession.builder().appName("test").master("local").getOrCreate(),
s"SELECT * FROM values").createOrReplaceTempView("values")
val df = sqlContext.sql("SELECT * FROM values WHERE time in (SELECT event_time FROM raw)")
df.explain(true)
and here is the logical plan:
== Parsed Logical Plan ==
'Project [*]
+- 'Filter 'time IN (list#5475 [])
: +- 'Project ['event_time]
: +- 'UnresolvedRelation [raw]
+- 'UnresolvedRelation [values]
== Analyzed Logical Plan ==
devicename: string, value: double, time: timestamp, coffee_machine_id: string, digital_twin_id: string, write_time: timestamp
Project [devicename#5457, value#5458, time#5459, coffee_machine_id#5460, digital_twin_id#5461, write_time#5462]
+- Filter time#5459 IN (list#5475 [])
: +- Project [event_time#4836]
: +- SubqueryAlias raw
: +- Sort [event_time#4836 ASC NULLS FIRST], true
: +- Relation[event_type#4835,event_time#4836,event_payload#4837,coffee_machine_id#4838,digital_twin_id#4839] JDBCRelation((SELECT * FROM events WHERE (event_time > '2021-03-31' or event_time < '2021-03-30') and event_type != 'Coffee_Capsule_RFID_Event' and event_type!='Coffee_Cup_RFID_Event' limit 2000000) SPARK_GEN_SUBQ_48) [numPartitions=1]
+- SubqueryAlias values
+- Relation[devicename#5457,value#5458,time#5459,coffee_machine_id#5460,digital_twin_id#5461,write_time#5462] JDBCRelation((SELECT * FROM values) SPARK_GEN_SUBQ_65) [numPartitions=1]
== Optimized Logical Plan ==
Join LeftSemi, (time#5459 = event_time#4836)
:- Relation[devicename#5457,value#5458,time#5459,coffee_machine_id#5460,digital_twin_id#5461,write_time#5462] JDBCRelation((SELECT * FROM values) SPARK_GEN_SUBQ_65) [numPartitions=1]
+- Project [event_time#4836]
+- Relation[event_type#4835,event_time#4836,event_payload#4837,coffee_machine_id#4838,digital_twin_id#4839] JDBCRelation((SELECT * FROM events WHERE (event_time > '2021-03-31' or event_time < '2021-03-30') and event_type != 'Coffee_Capsule_RFID_Event' and event_type!='Coffee_Cup_RFID_Event' limit 2000000) SPARK_GEN_SUBQ_48) [numPartitions=1]
== Physical Plan ==
SortMergeJoin [time#5459], [event_time#4836], LeftSemi
:- *(2) Sort [time#5459 ASC NULLS FIRST], false, 0
: +- Exchange hashpartitioning(time#5459, 200), true, [id=#1219]
: +- *(1) Scan JDBCRelation((SELECT * FROM values) SPARK_GEN_SUBQ_65) [numPartitions=1] [devicename#5457,value#5458,time#5459,coffee_machine_id#5460,digital_twin_id#5461,write_time#5462] PushedFilters: [], ReadSchema: struct<devicename:string,value:double,time:timestamp,coffee_machine_id:string,digital_twin_id:str...
+- *(4) Sort [event_time#4836 ASC NULLS FIRST], false, 0
+- Exchange hashpartitioning(event_time#4836, 200), true, [id=#1224]
+- *(3) Scan JDBCRelation((SELECT * FROM events WHERE (event_time > '2021-03-31' or event_time < '2021-03-30') and event_type != 'Coffee_Capsule_RFID_Event' and event_type!='Coffee_Cup_RFID_Event' limit 2000000) SPARK_GEN_SUBQ_48) [numPartitions=1] [event_time#4836] PushedFilters: [], ReadSchema: struct<event_time:timestamp>
Following mazaneicha's advice, I was able to resolve this by producing the WHERE clause in Scala from the DataFrame rows, which are not numerous compared to the data against which I run the extraction query:
var collectedString = scala.collection.mutable.MutableList[String]()

for (row <- dailySummariesDfVisualize.collectAsList()) {
  println(row(1))
  val start = row(1)
  val end = row(5)
  val timeSelection = s" time > ' ${start}' and time < '${end}'"
  collectedString += timeSelection
}

val whereClause = collectedString.mkString(" or ")
println(whereClause)

val dailySensorData =
  getDFFromJdbcSource(SparkSession.builder().appName("test").master("local").getOrCreate(),
    s"SELECT * FROM values WHERE " + whereClause + " limit 1000000")
    .persist(StorageLevel.MEMORY_ONLY_SER)

dailySensorData.show(400, false)
It produces exactly the output I needed, with acceptable performance.
The formatted whereClause output is something like:
time > ' 2021-03-24 07:06:34.0' and time < '2021-03-24 07:08:34.0' or time > ' 2021-03-24 07:07:41.0' and time < '2021-03-24 07:09:41.0' or time > ' 2021-03-24 07:07:43.0' and time < '2021-03-24 07:09:43.0'
and so on

org.postgresql.util.PSQLException: ERROR: syntax error at or near ":" Java side

I am trying to query PostgreSQL like this:
SELECT id , run_time
FROM run WHERE run.run_time > '2020-02-04' and run.run_time::date <= '2020-02-05'
My issue is the run.run_time::date cast.
Java: when I print the query while debugging, it shows:
SELECT id , run_time
FROM run WHERE run.run_time > '2020-02-04' and run.run_time::date <= '2020-02-05'
(As above)
So I'm expecting this to be sent as-is to the DB, but then I get this error log:
Hibernate:
select
id ,
run_time ,
from
run
where
run.cm_platform = 'COUGAR_RUN'
and run.cm_sw LIKE '%104%'
and run.run_time > '2020-02-04'
and run.run_time:date <= '2020-02-05'
2020-05-07 16:36:42.238 WARN 11184 --- [nio-8080-exec-1] o.h.engine.jdbc.spi.SqlExceptionHelper : SQL Error: 0, SQLState: 42601
2020-05-07 16:36:42.238 ERROR 11184 --- [nio-8080-exec-1] o.h.engine.jdbc.spi.SqlExceptionHelper : ERROR: syntax error at or near ":"
Position: 495
2020-05-07 16:36:42.254 ERROR 11184 --- [nio-8080-exec-1] o.a.c.c.C.[.[.[/].[dispatcherServlet] : Servlet.service() for servlet [dispatcherServlet] in context with path [] threw exception [Request processing failed; nested exception is org.springframework.dao.InvalidDataAccessResourceUsageException: could not extract ResultSet; SQL [n/a]; nested exception is org.hibernate.exception.SQLGrammarException: could not extract ResultSet] with root cause
org.postgresql.util.PSQLException: ERROR: syntax error at or near ":"
Position: 495
at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2440) ~[postgresql-42.2.5.jar:42.2.5]
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2183) ~[postgresql-42.2.5.jar:42.2.5]
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:308) ~[postgresql-42.2.5.jar:42.2.5]
at org.postgresql.jdbc.PgStatement.executeInternal(PgStatement.java:441) ~[postgresql-42.2.5.jar:42.2.5]
at org.postgresql.jdbc.PgStatement.execute(PgStatement.java:365) ~[postgresql-42.2.5.jar:42.2.5]
at org.postgresql.jdbc.PgPreparedStatement.executeWithFlags(PgPreparedStatement.java:143) ~[postgresql-42.2.5.jar:42.2.5]
at org.postgresql.jdbc.PgPreparedStatement.executeQuery(PgPreparedStatement.java:106) ~[postgresql-42.2.5.jar:42.2.5]
at com.zaxxer.hikari.pool.ProxyPreparedStatement.executeQuery(ProxyPreparedStatement.java:52) ~[HikariCP-3.2.0.jar:na]
at com.zaxxer.hikari.pool.HikariProxyPreparedStatement.executeQuery(HikariProxyPreparedStatement.java) ~[HikariCP-3.2.0.jar:na]
at org.hibernate.engine.jdbc.internal.ResultSetReturnImpl.extract(ResultSetReturnImpl.java:60) ~[hibernate-core-5.3.10.Final.jar:5.3.10.Final]
at org.hibernate.loader.Loader.getResultSet(Loader.java:2167) ~[hibernate-core-5.3.10.Final.jar:5.3.10.Final]
at org.hibernate.loader.Loader.executeQueryStatement(Loader.java:1930) ~[hibernate-core-5.3.10.Final.jar:5.3.10.Final]
at org.hibernate.loader.Loader.executeQueryStatement(Loader.java:1892) ~[hibernate-core-5.3.10.Final.jar:5.3.10.Final]
at org.hibernate.loader.Loader.doQuery(Loader.java:937) ~[hibernate-core-5.3.10.Final.jar:5.3.10.Final]
at org.hibernate.loader.Loader.doQueryAndInitializeNonLazyCollections(Loader.java:340) ~[hibernate-core-5.3.10.Final.jar:5.3.10.Final]
When I run the query manually it works fine, BUT here I see that and run.run_time:date <= '2020-02-05' has lost one of the colons from ::, so I think that's the problem. How do I force it to keep it?
Code
@Override
public List<Run> getCMstatisticDetails(SearchStatisticsForm searchStatisticsForm) {
    final String tableName = " run";
    String columnsName =
        Arrays.stream(Run.class.getFields())
              .filter(e -> (e.getAnnotation(Column.class) != null || e.getAnnotation(JoinColumn.class) != null))
              .map(e -> (tableName + "." + getColumnName(e)))
              .collect(Collectors.joining(" , "));
    StringBuilder queryStr = new StringBuilder();
    String thirdParam = (searchStatisticsForm.getCmPlatformVersion() == null) ? "" :
        tableName + ".run_time > " + "'" + searchStatisticsForm.getStartDate() + "'"
        + " and " + tableName + ".run_time::date <= " + "'" + searchStatisticsForm.getEndDate() + "'";
    String queryStringBase = "select " + columnsName +
        " from " + tableName + " where " + thirdParam;
    queryStr.append(queryStringBase);
    Query q = entityManager.createNativeQuery(queryStr.toString(), Run.class);
    List<Run> l = q.getResultList();
    return l;
}
I looked this up everywhere and found nothing helpful. Thanks in advance :)
You need to escape the ":" characters like this:
run.run_time\\:\\:date
or you can use the cast operator:
cast(run.run_time AS date)
The reason this happens is that with : Hibernate expects a named parameter, but there is no parameter, so it raises an error. There are other solutions:
cast(run.run_time as date) <= '2020-02-05'
but it is usually a bad idea to apply any operation (a cast or a function) to a field in the WHERE part, unless the field has a functional index on it.
You can rewrite it to
run.run_time < '2020-02-06'
OR
run.run_time < cast ('2020-02-05' as date) + interval '1 DAYS'

Unicode names of tables/columns in mssql db and sqlalchemy

I'm using the latest SQLAlchemy and the latest pymssql from pip to connect to MS SQL Server 8.00.2039 (2005?). The difficulty is that the table and column names are in Russian. Is it possible to handle this database with SQLAlchemy? At the very least I need to make 'select ... where' queries.
engine = create_engine("mssql+pymssql://%s:%s@RTBD/rt?charset=utf8" % (settings.RT_USER, settings.RT_PWD), echo=True, encoding='utf8')
metadata = MetaData()
metadata.reflect(engine, only = [u"Заказы",])
orders = metadata.tables[u'Заказы']
res = engine.execute(orders.select(orders.c[u'Номер заказа'] == u'14-01-0001'))
Exception is
ValueError Traceback (most recent call last)
<ipython-input-8-50ce93243d1c> in <module>()
----> 1 engine.execute(orders.select(orders.c[orders.columns.keys()[0]] == u'14-01-0001'))
python2.7/site-packages/sqlalchemy/engine/base.pyc in execute(self, statement, *multiparams, **params)
1680
1681 connection = self.contextual_connect(close_with_result=True)
-> 1682 return connection.execute(statement, *multiparams, **params)
1683
1684 def scalar(self, statement, *multiparams, **params):
python2.7/site-packages/sqlalchemy/engine/base.pyc in execute(self, object, *multiparams, **params)
718 type(object))
719 else:
--> 720 return meth(self, multiparams, params)
721
722 def _execute_function(self, func, multiparams, params):
python2.7/site-packages/sqlalchemy/sql/elements.pyc in _execute_on_connection(self, connection, multiparams, params)
315
316 def _execute_on_connection(self, connection, multiparams, params):
--> 317 return connection._execute_clauseelement(self, multiparams, params)
318
319 def unique_params(self, *optionaldict, **kwargs):
python2.7/site-packages/sqlalchemy/engine/base.pyc in _execute_clauseelement(self, elem, multiparams, params)
815 compiled_sql,
816 distilled_params,
--> 817 compiled_sql, distilled_params
818 )
819 if self._has_events or self.engine._has_events:
python2.7/site-packages/sqlalchemy/engine/base.pyc in _execute_context(self, dialect, constructor, statement, parameters, *args)
945 parameters,
946 cursor,
--> 947 context)
948
949 if self._has_events or self.engine._has_events:
python2.7/site-packages/sqlalchemy/engine/base.pyc in _handle_dbapi_exception(self, e, statement, parameters, cursor, context)
1109 )
1110
-> 1111 util.reraise(*exc_info)
1112
1113 finally:
python2.7/site-packages/sqlalchemy/engine/base.pyc in _execute_context(self, dialect, constructor, statement, parameters, *args)
938 statement,
939 parameters,
--> 940 context)
941 except Exception as e:
942 self._handle_dbapi_exception(
python2.7/site-packages/sqlalchemy/engine/default.pyc in do_execute(self, cursor, statement, parameters, context)
433
434 def do_execute(self, cursor, statement, parameters, context=None):
--> 435 cursor.execute(statement, parameters)
436
437 def do_execute_no_params(self, cursor, statement, context=None):
python2.7/site-packages/pymssql.so in pymssql.Cursor.execute (pymssql.c:6057)()
python2.7/site-packages/_mssql.so in _mssql.MSSQLConnection.execute_query (_mssql.c:9858)()
python2.7/site-packages/_mssql.so in _mssql.MSSQLConnection.execute_query (_mssql.c:9734)()
python2.7/site-packages/_mssql.so in _mssql.MSSQLConnection.format_and_run_query (_mssql.c:10814)()
python2.7/site-packages/_mssql.so in _mssql.MSSQLConnection.format_sql_command (_mssql.c:11042)()
python2.7/site-packages/_mssql.so in _mssql._substitute_params (_mssql.c:18359)()
<type 'str'>: (<type 'exceptions.UnicodeEncodeError'>, UnicodeEncodeError('ascii', u'params dictionary did not contain value for placeholder: \u041d\u043e\u043c\u0435\u0440 \u0437\u0430\u043a\u0430\u0437\u0430_1', 57, 62, 'ordinal not in range(128)'))
The query is right and ends with WHERE [Заказы].[Номер заказа] = %(Номер заказа_1)s
But info message from sqla is INFO sqlalchemy.engine.base.Engine {'\xd0\x9d\xd0\xbe\xd0\xbc\xd0\xb5\xd1\x80 \xd0\xb7\xd0\xb0\xd0\xba\xd0\xb0\xd0\xb7\xd0\xb0_1': '14-01-0001'}
The strings \xd0\x9d\xd0\xbe\xd0\xbc\xd0\xb5\xd1\x80 \xd0\xb7\xd0\xb0\xd0\xba\xd0\xb0\xd0\xb7\xd0\xb0_1 and \u041d\u043e\u043c\u0435\u0440 \u0437\u0430\u043a\u0430\u0437\u0430_1 are equal to Номер заказа_1
As stated on the mailing list, FreeTDS and friends are very picky about this. The following test works for me, but for the poster above it did not:
UnixODBC 2.3.0
FreeTDS 0.91
Pyodbc 3.0.7
Linux, not OS X (OS X has tons of problems with tds / pyodbc); I'm running on a Fedora 14 machine here
FreeTDS settings:
[sqlserver_2008_vmware]
host = 172.16.248.142
port = 1213
tds version = 7.2
client charset = UTF8
text size = 50000000
Test script:
# coding: utf-8
from sqlalchemy import create_engine, MetaData, Table, Column, String
e = create_engine("mssql+pyodbc://scott:tiger@ms_2008", echo=True)
#e = create_engine("mssql+pymssql://scott:tiger@172.16.248.142:1213", echo=True)
m = MetaData()
t = Table(u'Заказы', m, Column(u'Номер заказа', String(50)))
m.drop_all(e)
m.create_all(e)
orders = m.tables[u'Заказы']
e.execute(orders.select(orders.c[u'Номер заказа'] == u'14-01-0001'))
part of the output:
CREATE TABLE [Заказы] (
[Номер заказа] VARCHAR(50) NULL
)
2014-03-31 20:57:16,266 INFO sqlalchemy.engine.base.Engine ()
2014-03-31 20:57:16,268 INFO sqlalchemy.engine.base.Engine COMMIT
2014-03-31 20:57:16,270 INFO sqlalchemy.engine.base.Engine SELECT [Заказы].[Номер заказа]
FROM [Заказы]
WHERE [Заказы].[Номер заказа] = ?
2014-03-31 20:57:16,270 INFO sqlalchemy.engine.base.Engine (u'14-01-0001',)