I am creating a dataframe using Apache Spark version 2.3.1. When I try to count the dataframe, I get the following error:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 12, analitik11.{hostname}, executor 1): java.lang.ArrayIndexOutOfBoundsException: 2
at org.apache.spark.sql.vectorized.ColumnarBatch.column(ColumnarBatch.java:98)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.datasourcev2scan_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:297)
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2770)
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2769)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
at org.apache.spark.sql.Dataset.count(Dataset.scala:2769)
... 49 elided
Caused by: java.lang.ArrayIndexOutOfBoundsException: 2
at org.apache.spark.sql.vectorized.ColumnarBatch.column(ColumnarBatch.java:98)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.datasourcev2scan_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
We use com.hortonworks.spark.sql.hive.llap.HiveWarehouseBuilder to connect to Hive and read its tables. The code for generating the dataframe is as follows:
val hive = com.hortonworks.spark.sql.hive.llap.HiveWarehouseBuilder.session(spark).build()

val edgesTest = hive.executeQuery("select trim(s_vno) as src, trim(a_vno) as dst, share, administrator, account, all_share " +
  "from ebyn.babs_edges_2018 where (share <> 0 or administrator <> 0 or account <> 0 or all_share <> 0) and trim(date) = '201801'")

val share_org_edges = edgesTest.alias("df1")
  .join(edgesTest.alias("df2"), "src")
  .where("df1.dst <> df2.dst")
  .groupBy(
    greatest("df1.dst", "df2.dst").as("src"),
    least("df1.dst", "df2.dst").as("dst"))
  .agg(
    max("df1.share").as("share"),
    max("df1.administrator").as("administrator"),
    max("df1.account").as("account"),
    max("df1.all_share").as("all_share"))
  .persist

share_org_edges.count
Table properties are as follows:
CREATE TABLE `EBYN.BABS_EDGES_2018`(
`date` string,
`a_vno` string,
`s_vno` string,
`amount` double,
`num` int,
`share` int,
`share_ratio` int,
`administrator` int,
`account` int,
`share-all` int)
COMMENT 'Imported by sqoop on 2018/10/11 11:10:16'
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='',
'line.delim'='\n',
'serialization.format'='')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://ggmprod/warehouse/tablespace/managed/hive/ebyn.db/babs_edges_2018'
TBLPROPERTIES (
'bucketing_version'='2',
'transactional'='true',
'transactional_properties'='insert_only',
'transient_lastDdlTime'='1539245438')
Problem
edgesTest is a dataframe whose logical plan contains a single DataSourceV2Relation node. This DataSourceV2Relation node holds a mutable HiveWarehouseDataSourceReader that will be used to read the Hive table.
The edgesTest dataframe is used twice: as df1 and as df2.
During Spark's logical plan optimization, column pruning is applied twice to the same mutable HiveWarehouseDataSourceReader instance, and the second column pruning overwrites the first by setting its own required columns.
During execution, the reader fires the same query twice against the Hive warehouse, each time with only the columns required by the second column pruning. The Spark-generated code then does not find the columns it expects in the Hive query result.
Solutions
Spark 2.4
DataSourceV2 has been improved, in particular by SPARK-23203 (DataSourceV2 should use immutable trees).
Spark 2.3
Disable column pruning in the HiveWarehouseConnector datasource reader.
Hortonworks has already fixed this issue, as stated in the HDP 3.1.5 release notes.
The fix can be found in the HiveWarehouseConnector GitHub repository:
if (useSpark23xReader) {
  LOG.info("Using reader HiveWarehouseDataSourceReaderForSpark23x with column pruning disabled");
  return new HiveWarehouseDataSourceReaderForSpark23x(params);
} else if (disablePruningPushdown) {
  LOG.info("Using reader HiveWarehouseDataSourceReader with column pruning and filter pushdown disabled");
  return new HiveWarehouseDataSourceReader(params);
} else {
  LOG.info("Using reader PrunedFilteredHiveWarehouseDataSourceReader");
  return new PrunedFilteredHiveWarehouseDataSourceReader(params);
}
Also, the HDP 3.1.5 Hive integration documentation specifies:
To prevent data correctness issues in this release, pruning and projection pushdown is disabled by default.
...
To prevent these issues and ensure correct results, do not enable pruning and pushdowns.
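For completeness, this is how that setting could be applied from the Spark session used in the question (a sketch only; the property name is the one quoted in the HDP documentation and in the comment below, and should be verified against the HWC build actually deployed):

// Hedged sketch: ask the HiveWarehouseConnector reader to skip pruning and pushdowns.
// The property name is taken from the HDP docs / the comment below, not verified here.
spark.conf.set("spark.datasource.hive.warehouse.disable.pruning.and.pushdowns", "true")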
I faced the same problem, and even after disabling pruning/pushdown it still doesn't work.
The documentation is here: https://docs.cloudera.com/HDPDocuments/HDP3/HDP-3.1.4/integrating-hive/content/hive-read-write-operations.html, under "Pruning and Pushdowns".
In Python I set:
spark.conf.set('spark.datasource.hive.warehouse.disable.pruning.and.pushdowns', 'true')
But this doesn't work. Instead, I found a solution/workaround, which is to persist one of the dataframes (the one identified as faulty).
df1 = df.filter(xx).join(xx).persist()
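Applied to the dataframe in the question above, that workaround would look roughly like the sketch below (an assumption on my part: materializing edgesTest before the self-join keeps the two column prunings from hitting one shared reader).

// Hedged sketch: materialize edgesTest once, then self-join the cached data.
val edgesCached = edgesTest.persist()
edgesCached.count()  // force materialization before reusing it as df1 and df2

val share_org_edges = edgesCached.alias("df1")
  .join(edgesCached.alias("df2"), "src")
  .where("df1.dst <> df2.dst")
  // ...same groupBy/agg/persist as in the original question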
From the documentation, I guess Spark does projection pushdown against the parent dataframe; the error occurs when joining a dataframe with itself. Can someone explain it?
Also, let me know if this works for you.
Related
I have a task to remove all line delimiters (\n) from all string columns in a table.
The number of table columns is unknown; the code should be able to process any table.
I wrote code that goes through all columns in a loop, retrieves each column's data type, and replaces the line delimiter:
import org.apache.spark.sql.functions.{col, regexp_replace}

// let's assume we already have a dataframe 'df' (declared as a var) that can contain any table
df.cache()
val dfTypes = df.dtypes
for (i <- 0 until dfTypes.length) {
  val tupCol = dfTypes(i)
  if (tupCol._2 == "StringType") {
    df.unpersist()
    df = df.withColumn(tupCol._1, regexp_replace(col(tupCol._1), "\n", " "))
    df.cache()
  }
}
df.unpersist()
The code itself works fine, but when I run this code for ~50 tables in parallel I constantly get the following error for one random table:
18/11/20 04:31:41 WARN TaskSetManager: Lost task 9.0 in stage 6.0 (TID 29, ip-10-114-4-145.us-west-2.compute.internal, executor 1): java.io.IOException: No space left on device
at java.io.FileOutputStream.writeBytes(Native Method)
at java.io.FileOutputStream.write(FileOutputStream.java:326)
at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140)
at net.jpountz.lz4.LZ4BlockOutputStream.finish(LZ4BlockOutputStream.java:260)
at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:190)
at java.io.FilterOutputStream.close(FilterOutputStream.java:159)
at java.io.FilterOutputStream.close(FilterOutputStream.java:159)
at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$1.close(UnsafeRowSerializer.scala:96)
at org.apache.spark.storage.DiskBlockObjectWriter.commitAndGet(DiskBlockObjectWriter.scala:173)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:156)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I can run fewer or more than 50 jobs, but one (random) job always keeps failing.
The jobs are running on EMR cluster with the following configuration:
Master node: r4.2xlarge x 1
Core nodes: m5.2xlarge x 3
Task nodes: m5.2xlarge x (Autoscaling from 1 to 10)
I think my code consumes a lot of memory and disk space because it creates new dataframes in a loop, but I do not see any other way to process a table without knowing the number of string columns in advance.
I need a suggestion on how to optimize the code.
Thanks.
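One direction that might help (a sketch only, not a verified fix, and it assumes the same df as above): build all the replacements in a single select instead of unpersisting and re-caching inside the loop, so Spark plans one pass over the original dataframe rather than a chain of cached intermediates.

import org.apache.spark.sql.functions.{col, regexp_replace}

// Sketch: replace "\n" in every string column in one pass; other columns pass through unchanged.
val cleaned = df.select(df.dtypes.map {
  case (name, "StringType") => regexp_replace(col(name), "\n", " ").as(name)
  case (name, _)            => col(name)
}: _*)

Whether this reduces the disk usage enough in practice would need to be tested; the "No space left on device" error during shuffle write points at executor local disk filling up, so giving the nodes more local storage or running fewer jobs in parallel may still be necessary.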
I want to update a value where userid = 22650984. How can I do that in PySpark? Thank you for helping.
>>>xxDF.select('userid','registration_time').filter('userid="22650984"').show(truncate=False)
18/04/08 10:57:00 WARN TaskSetManager: Lost task 0.1 in stage 57.0 (TID 874, shopee-hadoop-slave89, executor 9): TaskKilled (killed intentionally)
18/04/08 10:57:00 WARN TaskSetManager: Lost task 11.1 in stage 57.0 (TID 875, shopee-hadoop-slave97, executor 16): TaskKilled (killed intentionally)
+--------+----------------------------+
|userid |registration_time |
+--------+----------------------------+
|22650984|270972-04-26 13:14:46.345152|
+--------+----------------------------+
If you want to modify a subset of your DataFrame and keep the rest unchanged, the best option is pyspark.sql.functions.when(); using filter() or pyspark.sql.functions.where() would remove all rows where the condition is not met.
from pyspark.sql.functions import col, when

valueWhenTrue = None  # for example

df.withColumn(
    "existingColumnToUpdate",
    when(
        col("userid") == 22650984,
        valueWhenTrue
    ).otherwise(col("existingColumnToUpdate"))
)
when() evaluates the first argument as a boolean condition. If the condition is True, it returns the second argument. You can chain multiple when() calls together, as shown in this post and also this post, or use otherwise() to specify what to do when the condition is False.
In this example, I am updating the existing column "existingColumnToUpdate". When userid equals the specified value, the column is set to valueWhenTrue; otherwise, the existing value is kept unchanged.
Change Value of a Dataframe Column Based on a Filter:
from pyspark.sql.functions import lit

new_df = xxDf.filter(xxDf.userid == "22650984").withColumn('column_to_update', lit(<update_expression>))
You can use withColumn to achieve what you are looking to do:
new_df = xxDf.filter(xxDf.userid == "22650984").withColumn("field_to_update", <update_expression>)
The update_expression would contain your update logic; it could be a UDF, a derived field, etc.
Below is my code for loading CSV data into a dataframe, computing the difference of two columns, and appending the result as a new column using withColumn. The two columns I am taking the difference of are of type Double. Please help me figure out the following exception:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

/**
 * Created by Guest1 on 5/10/2017.
 */
object arith extends App {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)

  val spark = SparkSession.builder().appName("Arithmetics")
    .config("spark.master", "local").getOrCreate()

  val df = spark.read.option("header", "true")
    .option("inferSchema", "true")
    .csv("./Input/Arith.csv").persist()

  // df.printSchema()
  val sim = df("Average Total Payments") - df("Average Medicare Payments").show(5)
}
I am getting the following exception:
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Exception in thread "main" org.apache.spark.sql.AnalysisException: Cannot resolve column name "Average Total Payments" among (DRG Definition, Provider Id, Provider Name, Provider Street Address, Provider City, Provider State, Provider Zip Code, Hospital Referral Region Description, Total Discharges , Average Covered Charges , Average Total Payments , Average Medicare Payments);
at org.apache.spark.sql.Dataset$$anonfun$resolve$1.apply(Dataset.scala:219)
at org.apache.spark.sql.Dataset$$anonfun$resolve$1.apply(Dataset.scala:219)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.Dataset.resolve(Dataset.scala:218)
at org.apache.spark.sql.Dataset.col(Dataset.scala:1073)
at org.apache.spark.sql.Dataset.apply(Dataset.scala:1059)
at arith$.delayedEndpoint$arith$1(arith.scala:19)
at arith$delayedInit$body.apply(arith.scala:7)
at scala.Function0$class.apply$mcV$sp(Function0.scala:34)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:35)
at scala.App$class.main(App.scala:76)
at arith$.main(arith.scala:7)
at arith.main(arith.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
There are multiple issues here.
First, if you look at the exception, it tells you that there is no "Average Total Payments" column in the dataframe (it also helpfully lists the columns it does see). It seems the column names read from the CSV have an extra space at the end.
Second, df("Average Total Payments") and df("Average Medicare Payments") are Columns.
You are trying to call show on df("Average Medicare Payments"). show is not a member of Column (and on a DataFrame it returns Unit, so you couldn't do df("Average Total Payments") - df("Average Medicare Payments").show(5) anyway, because that would be Column - Unit).
What you want to do is define a new column which is the difference between the two and add it to the dataframe as a new column. Then you want to select just that column and show it. For example:
val sim = df.withColumn("diff",df("Average Total Payments") -df("Average Medicare Payments"))
sim.select("diff").show(5)
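Since the exception suggests the CSV header names carry trailing spaces, one possible way to handle that (a sketch only; the trailing-space reading is inferred from the column list in the error message) is to normalize the column names before referencing them:

// Sketch: trim whitespace from every column name read from the CSV header.
val trimmed = df.toDF(df.columns.map(_.trim): _*)

val sim = trimmed.withColumn("diff", trimmed("Average Total Payments") - trimmed("Average Medicare Payments"))
sim.select("diff").show(5)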
Here is my Table:
CREATE TABLE mytable
(
id uuid,
day text,
mytime timestamp,
value text,
status int,
PRIMARY KEY ((id, day), mytime )
)
WITH CLUSTERING ORDER BY (mytime desc)
;
Here is the Index:
CREATE INDEX IF NOT EXISTS idx_status ON mytable (status);
When I run this select statement, I get the expected results:
select * from mytable
where id = 38403e1e-44b0-11e4-bd3d-005056a93afd
AND day = '2014-10-29'
;
62 rows are returned as a result of this query.
If I add to this query to include the index column:
select * from mytable
where id = 38403e1e-44b0-11e4-bd3d-005056a93afd
AND day = '2014-10-29'
AND status = 5
;
zero rows are returned. (there are several records with status = 5)
If I query the table...looking ONLY for a specific index value:
select * from mytable
where status = 5
;
zero rows are also returned.
I'm at a loss. I don't understand what exactly is taking place.
I am on a 3-node cluster with replication factor 3, running Cassandra 2.1.3.
Could this be a configuration issue in cassandra.yaml?
Or is there an issue with my select statement?
Appreciate the assistance, thanks.
UPDATE:
I am seeing this in the system.log file. Any ideas?
ERROR [CompactionExecutor:1266] 2015-03-24 15:20:26,596 CassandraDaemon.java:167 - Exception in thread Thread[CompactionExecutor:1266,1,main]
java.lang.AssertionError: /cdata/cassandra/data/my_table-c5f756b5318532afb494483fa1828675/my_table.idx_status-ka-32-Data.db
at org.apache.cassandra.io.sstable.SSTableReader.getApproximateKeyCount(SSTableReader.java:235) ~[apache-cassandra-2.1.3.jar:2.1.3]
at org.apache.cassandra.db.compaction.CompactionTask.runMayThrow(CompactionTask.java:153) ~[apache-cassandra-2.1.3.jar:2.1.3]
at org.apache.cassandra.utils.WrappedRunnable.run(WrappedRunnable.java:28) ~[apache-cassandra-2.1.3.jar:2.1.3]
at org.apache.cassandra.db.compaction.CompactionTask.executeInternal(CompactionTask.java:76) ~[apache-cassandra-2.1.3.jar:2.1.3]
at org.apache.cassandra.db.compaction.AbstractCompactionTask.execute(AbstractCompactionTask.java:59) ~[apache-cassandra-2.1.3.jar:2.1.3]
at org.apache.cassandra.db.compaction.CompactionManager$BackgroundCompactionTask.run(CompactionManager.java:240) ~[apache-cassandra-2.1.3.jar:2.1.3]
at java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source) ~[na:1.7.0_51]
at java.util.concurrent.FutureTask.run(Unknown Source) ~[na:1.7.0_51]
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) [na:1.7.0_51]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) [na:1.7.0_51]
at java.lang.Thread.run(Unknown Source) [na:1.7.0_51]
I ran your steps above and was able to query rows by status = 5 just fine. One thing I can suggest is to try rebuilding your index. Try this from a command prompt:
nodetool rebuild_index mykeyspace mytable idx_status
Otherwise, IMO the best way to solve this is not with a secondary index. If you know that you are going to have to support queries by status (especially with a large dataset), then I would seriously consider building a specific, additional "query table" for it.
CREATE TABLE mytablebystatus (id uuid, day text, mytime timestamp, value text, status int,
PRIMARY KEY ((status),day,mytime,id));
This would support queries only by status, or by status and day sorted by mytime. In summary, I would experiment with a few different PRIMARY KEY definitions and see which best suits your query patterns. That way, you can avoid having to use ill-performing secondary indexes altogether.
I am trying to create a table (CTAS) from Hive and want to write the file in BSON format, in order to import it into MongoDB.
Here is my query:
create table if not exists rank_locn
ROW FORMAT SERDE "com.mongodb.hadoop.hive.BSONSerde"
STORED AS INPUTFORMAT "com.mongodb.hadoop.BSONFileInputFormat"
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
as
select RGN_OVRHD_NBR,DM_OVRHD_NBR,LOCN_NBR,Derived,
rank() OVER (ORDER BY DERIVED DESC) as NationalRnk,
rank() OVER (PARTITION BY RGN_OVRHD_NBR ORDER BY DERIVED DESC) as RegionRnk,
rank() OVER (PARTITION BY DM_OVRHD_NBR ORDER BY DERIVED DESC) as DistrictRnk
from Locn_Dim_Values
where Derived between -999999 and 999999;
Three jobs are launched. The last reduce job fails. The error log is as follows:
java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row (tag=0) {"key":{"reducesinkkey0":78133,"reducesinkkey1":143.82632293080053},"value":{"_col0":1,"_col1":12,"_col2":79233,"_col3":78133,"_col4":1634,"_col5":143.82632293080053},"alias":0}
at org.apache.hadoop.hive.ql.exec.ExecReducer.reduce(ExecReducer.java:274)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:522)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:421)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row (tag=0) {"key":{"reducesinkkey0":78133,"reducesinkkey1":143.82632293080053},"value":{"_col0":1,"_col1":12,"_col2":79233,"_col3":78133,"_col4":1634,"_col5":143.82632293080053},"alias":0}
at org.apache.hadoop.hive.ql.exec.ExecReducer.reduce(ExecReducer.java:262)
... 7 more
Caused by: java.lang.NullPointerException
at org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat$1.write(HiveIgnoreKeyTextOutputFormat.java:91)
at org.apache.hadoop.hive.ql.exec.FileSinkOperator.processOp(FileSinkOperator.java:637)
at org.apache.hadoop.hive.ql.exec.Operator.process(Operator.java:502)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:832)
at org.apache.hadoop.hive.ql.exec.SelectOperator.processOp(SelectOperator.java:84)
at org.apache.hadoop.hive.ql.exec.Operator.process(Operator.java:502)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:832)
at org.apache.hadoop.hive.ql.exec.PTFOperator.executeWindowExprs(PTFOperator.java:341)
at org.apache.hadoop.hive.ql.exec.PTFOperator.processInputPartition(PTFOperator.java:198)
at org.apache.hadoop.hive.ql.exec.PTFOperator.processOp(PTFOperator.java:130)
at org.apache.hadoop.hive.ql.exec.Operator.process(Operator.java:502)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:832)
at org.apache.hadoop.hive.ql.exec.ExtractOperator.processOp(ExtractOperator.java:45)
at org.apache.hadoop.hive.ql.exec.Operator.process(Operator.java:502)
at org.apache.hadoop.hive.ql.exec.ExecReducer.reduce(ExecReducer.java:253)
... 7 more
Please help me resolve the issue.