Beam DirectRunner Calcite can't specify name - apache-beam

I'm running a simplified version of this Beam tutorial, but using the DirectRunner on my local machine.
import apache_beam as beam
from apache_beam.transforms.sql import SqlTransform
import os

with beam.Pipeline() as p:
    rows = (
        p
        | beam.Create([
            beam.Row(col1="val1", col2="col2_val1"),
            beam.Row(col1="val2", col2="col2_val2"),
        ])
    )
    ({"my_table": rows} | SqlTransform("""SELECT * FROM my_table"""))
If I change my_table to PCOLLECTION it works (although for it to really work I also need to pass in rows directly instead of the dict).
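In other words, this variant runs, querying the implicit PCOLLECTION table and piping the PCollection in directly:

    # Works: a single PCollection is registered under the implicit name PCOLLECTION
    rows | SqlTransform("""SELECT * FROM PCOLLECTION""")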
The error message I get with the my_table version:
Traceback (most recent call last):
File "./lib/scratch/test_join.py", line 12, in <module>
({"my_table": rows} | SqlTransform("""SELECT * FROM my_table"""))
File "/Users/steeling/src/versions/lib/python3.7/site-packages/apache_beam/transforms/ptransform.py", line 606, in __ror__
result = p.apply(self, pvalueish, label)
File "/Users/steeling/src/versions/lib/python3.7/site-packages/apache_beam/pipeline.py", line 694, in apply
pvalueish_result = self.runner.apply(transform, pvalueish, self._options)
File "/Users/steeling/src/versions/lib/python3.7/site-packages/apache_beam/runners/runner.py", line 185, in apply
return m(transform, input, options)
File "/Users/steeling/src/versions/lib/python3.7/site-packages/apache_beam/runners/runner.py", line 215, in apply_PTransform
return transform.expand(input)
File "/Users/steeling/src/versions/lib/python3.7/site-packages/apache_beam/transforms/external.py", line 305, in expand
raise RuntimeError(response.error)
RuntimeError: org.apache.beam.sdk.extensions.sql.impl.ParseException: Unable to parse query SELECT * FROM my_table
at org.apache.beam.sdk.extensions.sql.impl.CalciteQueryPlanner.convertToBeamRel(CalciteQueryPlanner.java:214)
at org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv.parseQuery(BeamSqlEnv.java:111)
at org.apache.beam.sdk.extensions.sql.SqlTransform.expand(SqlTransform.java:171)
at org.apache.beam.sdk.extensions.sql.SqlTransform.expand(SqlTransform.java:109)
at org.apache.beam.sdk.Pipeline.applyInternal(Pipeline.java:548)
at org.apache.beam.sdk.Pipeline.applyTransform(Pipeline.java:499)
at org.apache.beam.sdk.expansion.service.ExpansionService$TransformProvider.apply(ExpansionService.java:367)
at org.apache.beam.sdk.expansion.service.ExpansionService.expand(ExpansionService.java:470)
at org.apache.beam.sdk.expansion.service.ExpansionService.expand(ExpansionService.java:546)
at org.apache.beam.model.expansion.v1.ExpansionServiceGrpc$MethodHandlers.invoke(ExpansionServiceGrpc.java:219)
at org.apache.beam.vendor.grpc.v1p36p0.io.grpc.stub.ServerCalls$UnaryServerCallHandler$UnaryServerCallListener.onHalfClose(ServerCalls.java:182)
at org.apache.beam.vendor.grpc.v1p36p0.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.halfClosed(ServerCallImpl.java:331)
at org.apache.beam.vendor.grpc.v1p36p0.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1HalfClosed.runInContext(ServerImpl.java:797)
at org.apache.beam.vendor.grpc.v1p36p0.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at org.apache.beam.vendor.grpc.v1p36p0.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:123)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.tools.ValidationException: org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.runtime.CalciteContextException: From line 1, column 15 to line 1, column 22: Object 'my_table' not found
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.prepare.PlannerImpl.validate(PlannerImpl.java:217)
at org.apache.beam.sdk.extensions.sql.impl.CalciteQueryPlanner.convertToBeamRel(CalciteQueryPlanner.java:183)
... 17 more
Caused by: org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.runtime.CalciteContextException: From line 1, column 15 to line 1, column 22: Object 'my_table' not found
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.runtime.Resources$ExInstWithCause.ex(Resources.java:463)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.SqlUtil.newContextException(SqlUtil.java:824)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.SqlUtil.newContextException(SqlUtil.java:809)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.newValidationError(SqlValidatorImpl.java:4805)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.IdentifierNamespace.resolveImpl(IdentifierNamespace.java:172)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.IdentifierNamespace.validateImpl(IdentifierNamespace.java:177)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.AbstractNamespace.validate(AbstractNamespace.java:84)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.validateNamespace(SqlValidatorImpl.java:995)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.validateQuery(SqlValidatorImpl.java:955)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.validateFrom(SqlValidatorImpl.java:3109)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.validateFrom(SqlValidatorImpl.java:3091)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.validateSelect(SqlValidatorImpl.java:3363)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SelectNamespace.validateImpl(SelectNamespace.java:60)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.AbstractNamespace.validate(AbstractNamespace.java:84)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.validateNamespace(SqlValidatorImpl.java:995)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.validateQuery(SqlValidatorImpl.java:955)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.SqlSelect.validate(SqlSelect.java:216)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.validateScopedExpression(SqlValidatorImpl.java:930)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorImpl.validate(SqlValidatorImpl.java:637)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.prepare.PlannerImpl.validate(PlannerImpl.java:215)
... 18 more
Caused by: org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.sql.validate.SqlValidatorException: Object 'my_table' not found
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.runtime.Resources$ExInstWithCause.ex(Resources.java:463)
at org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.runtime.Resources$ExInst.ex(Resources.java:572)
... 37 more

You encounter this error because you only have one PCollection. It seems that Apache Beam uses PCOLLECTION as the table source when it receives a single input, even if you created a dictionary with the key-value pair {"my_table": rows}.
As a workaround, if you would like to explicitly name a table in your SQL statement, you can create a temporary PCollection that contains dummy values and pass in a dictionary with key-value pairs for the two PCollections.
import apache_beam as beam
from apache_beam.transforms.sql import SqlTransform
import os

with beam.Pipeline() as p:
    rows = (
        p
        | "create rows" >> beam.Create([
            beam.Row(col1="val1", col2="col2_val1"),
            beam.Row(col1="val2", col2="col2_val2"),
        ])
    )
    rows_2 = (
        p
        | "create rows_2" >> beam.Create([
            beam.Row(col1_1="val1", col2_1="123"),
        ])
    )
    ({"my_table": rows, "my_table2": rows_2}
     | SqlTransform("""SELECT * FROM my_table""")
     | beam.Map(lambda row: "col1: %s, col2: %s" % (row.col1, row.col2))
     | beam.Map(print))
The output is (element order may vary):
col1: val1, col2: col2_val1
col1: val2, col2: col2_val2
You can open an Apache Beam JIRA issue to ask whether your use case could be supported in a future release.

Related

Getting error when I try to create an iceberg table using dataFrame.write() in spark and store it in a cloud Filesystem source

Following is the script I wrote:
var df2 = spark.read.parquet("<file_path>")
df2.write.format("iceberg").save("<destination_path>")
When I run the script I get the following error:
RuntimeException: Failed to get table info from metastore gs://dremio-qa/flatten.listofstructwithnulls30_iceberg
Caused by: MetaException: Exception thrown when executing query : SELECT DISTINCT 'org.apache.hadoop.hive.metastore.model.MTable' AS NUCLEUS_TYPE,A0.CREATE_TIME,A0.LAST_ACCESS_TIME,A0.OWNER,A0.RETENTION,A0.IS_REWRITE_ENABLED,A0.TBL_NAME,A0.TBL_TYPE,A0.TBL_ID FROM TBLS A0 LEFT OUTER JOIN DBS B0 ON A0.DB_ID = B0.DB_ID WHERE A0.TBL_NAME = ? AND B0.`NAME` = ?
Caused by: JDOException: Exception thrown when executing query : SELECT DISTINCT 'org.apache.hadoop.hive.metastore.model.MTable' AS NUCLEUS_TYPE,A0.CREATE_TIME,A0.LAST_ACCESS_TIME,A0.OWNER,A0.RETENTION,A0.IS_REWRITE_ENABLED,A0.TBL_NAME,A0.TBL_TYPE,A0.TBL_ID FROM TBLS A0 LEFT OUTER JOIN DBS B0 ON A0.DB_ID = B0.DB_ID WHERE A0.TBL_NAME = ? AND B0.`NAME` = ?
Caused by: SQLSyntaxErrorException: (conn=65340) Unknown column 'A0.IS_REWRITE_ENABLED' in 'field list'
Caused by: MariaDbSqlException: Unknown column 'A0.IS_REWRITE_ENABLED' in 'field list'
Caused by: SQLException: Unknown column 'A0.IS_REWRITE_ENABLED' in 'field list'
data.write
  .format("iceberg")
  .mode("append")
  .save("db.table")
You should use db.table instead of '<destination_path>'.
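For reference, a PySpark sketch of the same fix (assuming db.table already exists in a configured Iceberg catalog; the parquet path is a placeholder):

# Read the parquet file, then append to the Iceberg catalog table by name
df2 = spark.read.parquet("<file_path>")
df2.write.format("iceberg").mode("append").save("db.table")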

Syntax error at or near "order" (Scala with Quill, Doobie and PostgreSQL)

I am using Quill with Doobie and PostgreSQL (org.tpolecat.doobie-quill artifact with version 0.13.1).
This code
case class SomeRecord(id: Int, order: Int, name: String)
val record = SomeRecord(0, 0, "test")
run(
  quote(
    querySchema[SomeRecord]("some_table")
  ).insert(lift(record))
)
will fail at runtime with this error:
org.postgresql.util.PSQLException: ERROR: syntax error at or near "order"
Position: 46
at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2553)
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2285)
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:323)
at org.postgresql.jdbc.PgStatement.executeInternal(PgStatement.java:481)
at org.postgresql.jdbc.PgStatement.execute(PgStatement.java:401)
at org.postgresql.jdbc.PgPreparedStatement.executeWithFlags(PgPreparedStatement.java:164)
at org.postgresql.jdbc.PgPreparedStatement.executeUpdate(PgPreparedStatement.java:130)
at com.zaxxer.hikari.pool.ProxyPreparedStatement.executeUpdate(ProxyPreparedStatement.java:61)
at com.zaxxer.hikari.pool.HikariProxyPreparedStatement.executeUpdate(HikariProxyPreparedStatement.java)
at doobie.free.KleisliInterpreter$PreparedStatementInterpreter.$anonfun$executeUpdate$5(kleisliinterpreter.scala:955)
at doobie.free.KleisliInterpreter$PreparedStatementInterpreter.$anonfun$executeUpdate$5$adapted(kleisliinterpreter.scala:955)
at doobie.free.KleisliInterpreter.$anonfun$primitive$2(kleisliinterpreter.scala:109)
It seems that Quill does not escape keyword-like column names, so a column named "order" (or any other reserved keyword) in its generated query will always fail. See Escaping keyword-like column names in Postgres. The workaround is to rename the column in the table (and in the corresponding case class).

Join two tables using pyspark hive context

I am seeing the error below when joining two Hive tables using the PySpark HiveContext.
error:
File "/usr/hdp/2.3.4.7-4/spark/python/lib/pyspark.zip/pyspark/sql/context.py", line 552, in sql
File "/usr/hdp/2.3.4.7-4/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
File "/usr/hdp/2.3.4.7-4/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 36, in deco
File "/usr/hdp/2.3.4.7-4/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o41.sql.
: org.apache.spark.SparkException: Job cancelled because SparkContext was shut down
Example:
lsf.registerTempTable('temp_table')
out = hc.sql("""
    INSERT OVERWRITE TABLE AAAAAA PARTITION (day='2017-09-20')
    SELECT tt.*, ht.id
    FROM temp_table tt
    JOIN hive_table ht
      ON tt.id = ht.id
""")
Also, how do I parameterize day? (One possible approach is sketched below.)
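One option (an assumption, not from the original thread) is ordinary Python string formatting, since hc.sql() takes a plain string:

# Substitute the partition value into the query string before submitting it
day = '2017-09-20'
out = hc.sql("""
    INSERT OVERWRITE TABLE AAAAAA PARTITION (day='{day}')
    SELECT tt.*, ht.id
    FROM temp_table tt
    JOIN hive_table ht
      ON tt.id = ht.id
""".format(day=day))  # fine for trusted values; never splice in raw user input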

Load pandas dataframe into Spark cluster

I have a Postgres database and I want to run a query and load a table into a Spark DataFrame. Some columns of my database are arrays. For example:
=> select id, f_2 from raw limit 1;
will return
 id |                f_2
----+--------------------------------------
  1 | {{140,130},{NULL,NULL},{NULL,NULL}}
What I want is to access 140 (the first element of the inner array), which is easy in Postgres using this query:
=> select id, f_2[1][1] from raw limit 1;
 id | f_2
----+-----
  1 | 140
but I want to load it into a Spark DataFrame, and here is my code to load the data:
df = sqlContext.sql("""
    select id as id,
           f_2 as A
    from raw
""")
which returns this error:
Py4JJavaError: An error occurred while calling o560.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 1 times, most recent failure: Lost task 0.0 in stage 4.0 (TID 4, localhost, executor driver): java.lang.ClassCastException: [Ljava.lang.Integer; cannot be cast to java.lang.Integer
and then I tried this one:
df = sqlContext.sql("""
    select id as id,
           f_2[0] as A
    from raw
""")
and got the same error, and then tried this one:
df = sqlContext.sql("""
    select id as id,
           f_2[0][0] as A
    from raw
""")
which returns this error:
ERROR: An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 0))
AnalysisException: u"Can't extract value from f_2#32685[0];"

Cassandra errors when trying the new CQL 3

I downloaded Cassandra 1.1.1 and launched cqlsh with CQL version 3.
I tried to create a new column family:
CREATE TABLE stats (
  pid blob,
  period int,
  targetid blob,
  sum counter,
  PRIMARY KEY (pid, period, targetid)
);
But I got this:
Traceback (most recent call last):
File "./cqlsh", line 908, in perform_statement
self.cursor.execute(statement, decoder=decoder)
File "./../lib/cql-internal-only-1.0.10.zip/cql-1.0.10/cql/cursor.py", line 117, in execute
response = self.handle_cql_execution_errors(doquery, prepared_q, compress)
File "./../lib/cql-internal-only-1.0.10.zip/cql-1.0.10/cql/cursor.py", line 132, in handle_cql_execution_errors
return executor(*args, **kwargs)
File "./../lib/cql-internal-only-1.0.10.zip/cql-1.0.10/cql/cassandra/Cassandra.py", line 1583, in execute_cql_query
self.send_execute_cql_query(query, compression)
File "./../lib/cql-internal-only-1.0.10.zip/cql-1.0.10/cql/cassandra/Cassandra.py", line 1593, in send_execute_cql_query
self.oprot.trans.flush()
File "./../lib/thrift-python-internal-only-0.7.0.zip/thrift/transport/TTransport.py", line 293, in flush
self._trans.write(buf)
File "./../lib/thrift-python-internal-only-0.7.0.zip/thrift/transport/TSocket.py", line 117, in write
plus = self.handle.send(buff)
error: [Errno 32] Broken pipe
And on the server console:
Error occurred during processing of message.
java.lang.IllegalArgumentException
at java.nio.Buffer.limit(Buffer.java:247)
at org.apache.cassandra.db.marshal.AbstractCompositeType.getBytes(AbstractCompositeType.java:51)
at org.apache.cassandra.db.marshal.AbstractCompositeType.getWithShortLength(AbstractCompositeType.java:60)
at org.apache.cassandra.db.marshal.AbstractCompositeType.getString(AbstractCompositeType.java:140)
at org.apache.cassandra.config.CFMetaData.validate(CFMetaData.java:929)
at org.apache.cassandra.service.MigrationManager.announceNewColumnFamily(MigrationManager.java:131)
at org.apache.cassandra.cql3.statements.CreateColumnFamilyStatement.announceMigration(CreateColumnFamilyStatement.java:83)
at org.apache.cassandra.cql3.statements.SchemaAlteringStatement.execute(SchemaAlteringStatement.java:99)
at org.apache.cassandra.cql3.QueryProcessor.processStatement(QueryProcessor.java:108)
at org.apache.cassandra.cql3.QueryProcessor.process(QueryProcessor.java:121)
at org.apache.cassandra.thrift.CassandraServer.execute_cql_query(CassandraServer.java:1237)
at org.apache.cassandra.thrift.Cassandra$Processor$execute_cql_query.getResult(Cassandra.java:3542)
at org.apache.cassandra.thrift.Cassandra$Processor$execute_cql_query.getResult(Cassandra.java:3530)
at org.apache.thrift.ProcessFunction.process(ProcessFunction.java:32)
at org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:34)
at org.apache.cassandra.thrift.CustomTThreadPoolServer$WorkerProcess.run(CustomTThreadPoolServer.java:186)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:680)
I'd suggest reporting bugs at https://issues.apache.org/jira/browse/CASSANDRA.