Error:
Exception in thread "main" java.lang.RuntimeException
{"errors":[{"debugInfo":"generic::failed_precondition: Repeated record added outside of an array.","reason":"invalid"}],"index":0}
Language: Scala
Gradle BigQuery dependency:
compile "com.google.apis:google-api-services-bigquery:v2-rev326-1.22.0"
The code to generate table schema:
import scala.collection.JavaConversions._
val orderTableSchema = new TableSchema()
orderTableSchema.setFields(Seq(
  new TableFieldSchema().setName("revisionId").setType("STRING").setMode("REQUIRED"),
  new TableFieldSchema().setName("executions").setType("RECORD").setMode("REPEATED").setFields(Seq(
    new TableFieldSchema().setName("ts").setType("INTEGER"),
    new TableFieldSchema().setName("price").setType("FLOAT"),
    new TableFieldSchema().setName("qty").setType("INTEGER")
  ))
))
The table is created successfully with the correct schema for the executions column, as shown in the BigQuery web UI:
revisionId STRING REQUIRED
executions RECORD REPEATED
executions.ts INTEGER NULLABLE
executions.price FLOAT NULLABLE
executions.qty INTEGER NULLABLE
The code to insert data which fails:
import scala.collection.JavaConversions._
val row = new TableRow()
  .set("revisionId", "revision1")
  .set("executions", Seq(
    new TableRow().set("ts", 1L).set("price", 100.01).set("qty", 1000)
  ))
val content = new TableDataInsertAllRequest().setRows(Seq(
  new TableDataInsertAllRequest.Rows().setJson(row)
))
val insertAll = bigQuery.tabledata().insertAll(projectId, datasetId, "order", content)
val insertResponse = insertAll.execute()
if (insertResponse.getInsertErrors != null) {
  insertResponse.getInsertErrors.foreach(println)
  // this prints:
  // {"errors":[{"debugInfo":"generic::failed_precondition: Repeated record added outside of an array.","reason":"invalid"}],"index":0}
  // throw to just terminate early for demo
  throw new RuntimeException()
}
Found the problem.
It's something with the Scala-to-Java collections conversion mess. I had to explicitly add the conversion using JavaConversions.seqAsJavaList, and it magically started working:
val row = new TableRow()
  .set("revisionId", s"revisionId$v")
  .set("executions", JavaConversions.seqAsJavaList(Seq(
    new TableRow().set("ts", 1L).set("price", 100.01).set("qty", 1000),
    new TableRow().set("ts", 2L).set("price", 100.02).set("qty", 2000),
    new TableRow().set("ts", 3L).set("price", 100.03).set("qty", 3000)
  )))
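The likely reason the implicit conversion never kicked in is that TableRow.set takes a plain java.lang.Object, so a Scala Seq is accepted as-is rather than being converted to a java.util.List, and the client then serializes it as something other than a JSON array. A minimal sketch of the same fix using the non-deprecated JavaConverters (assuming the rest of the insert code is unchanged):

import scala.collection.JavaConverters._

val row = new TableRow()
  .set("revisionId", "revision1")
  .set("executions", Seq(
    new TableRow().set("ts", 1L).set("price", 100.01).set("qty", 1000)
  ).asJava) // explicit conversion to java.util.List, so the field is serialized as a JSON array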
I am trying to execute the Data Generator function provided by Microsoft to test streaming data to Event Hubs.
Unfortunately, I keep getting the error
Processing failure: No such file or directory
When I try and execute the function:
%scala
DummyDataGenerator.start(15)
Can someone take a look at the code and help decipher why I'm getting the error:
class DummyDataGenerator:
  streamDirectory = "/FileStore/tables/flight"
None # suppress output
I'm not sure how the above cell gets used by the DummyDataGenerator defined below.
%scala
import scala.util.Random
import java.io._
import java.time._
// Notebook #2 has to set this to 8, we are setting
// it to 200 to "restore" the default behavior.
spark.conf.set("spark.sql.shuffle.partitions", 200)
// Make the username available to all other languages.
// "WARNING: use of the "current" username is unpredictable
// when multiple users are collaborating and should be replaced
// with the notebook ID instead.
val username = com.databricks.logging.AttributionContext.current.tags(com.databricks.logging.BaseTagDefinitions.TAG_USER);
spark.conf.set("com.databricks.training.username", username)
object DummyDataGenerator extends Runnable {
  var runner : Thread = null;
  val className = getClass().getName()
  val streamDirectory = s"dbfs:/tmp/$username/new-flights"
  val airlines = Array( ("American", 0.17), ("Delta", 0.12), ("Frontier", 0.14), ("Hawaiian", 0.13), ("JetBlue", 0.15), ("United", 0.11), ("Southwest", 0.18) )
  val reasons = Array("Air Carrier", "Extreme Weather", "National Aviation System", "Security", "Late Aircraft")
  val rand = new Random(System.currentTimeMillis())
  var maxDuration = 3 * 60 * 1000 // default to three minutes

  def clean() {
    System.out.println("Removing old files for dummy data generator.")
    dbutils.fs.rm(streamDirectory, true)
    if (dbutils.fs.mkdirs(streamDirectory) == false) {
      throw new RuntimeException("Unable to create temp directory.")
    }
  }

  def run() {
    val date = LocalDate.now()
    val start = System.currentTimeMillis()
    while (System.currentTimeMillis() - start < maxDuration) {
      try {
        val dir = s"/dbfs/tmp/$username/new-flights"
        val tempFile = File.createTempFile("flights-", "", new File(dir)).getAbsolutePath()+".csv"
        val writer = new PrintWriter(tempFile)
        for (airline <- airlines) {
          val flightNumber = rand.nextInt(1000)+1000
          val deptTime = rand.nextInt(10)+10
          val departureTime = LocalDateTime.now().plusHours(-deptTime)
          val (name, odds) = airline
          val reason = Random.shuffle(reasons.toList).head
          val test = rand.nextDouble()
          val delay = if (test < odds)
            rand.nextInt(60)+(30*odds)
          else rand.nextInt(10)-5
          println(s"- Flight #$flightNumber by $name at $departureTime delayed $delay minutes due to $reason")
          writer.println(s""" "$flightNumber","$departureTime","$delay","$reason","$name" """.trim)
        }
        writer.close()
        // wait a couple of seconds
        //Thread.sleep(rand.nextInt(5000))
      } catch {
        case e: Exception => {
          printf("* Processing failure: %s%n", e.getMessage())
          return;
        }
      }
    }
    println("No more flights!")
  }

  def start(minutes:Int = 5) {
    maxDuration = minutes * 60 * 1000
    if (runner != null) {
      println("Stopping dummy data generator.")
      runner.interrupt();
      runner.join();
    }
    println(s"Running dummy data generator for $minutes minutes.")
    runner = new Thread(this);
    runner.run();
  }

  def stop() {
    start(0)
  }
}
DummyDataGenerator.clean()
displayHTML("Imported streaming logic...") // suppress output
You should be able to use the Databricks Labs Data Generator on the Databricks community edition. I'm providing the instructions below:
Running Databricks Labs Data Generator on the community edition
The Databricks Labs Data Generator is a PySpark library, so the code to generate the data needs to be Python. But you should be able to create a view on the generated data and consume it from Scala if that's your preferred language.
You can install the framework on the Databricks community edition by creating a notebook with the cell
%pip install git+https://github.com/databrickslabs/dbldatagen
Once it's installed, you can use the library to define a data generation spec and, by calling build, generate a Spark dataframe from it.
The following example shows generation of batch data similar to the data set you are trying to generate. It should be placed in a separate notebook cell.
Note - here we generate 10 million records to illustrate the ability to create larger data sets; it can be used to generate datasets much larger than that.
%python
import dbldatagen as dg  # assumes the library was installed with the %pip cell above

num_rows = 10 * 1000000  # number of rows to generate
num_partitions = 8  # number of Spark dataframe partitions

delay_reasons = ["Air Carrier", "Extreme Weather", "National Aviation System", "Security", "Late Aircraft"]

# will have implied column `id` for ordinal of row
flightdata_defn = (dg.DataGenerator(spark, name="flight_delay_data", rows=num_rows, partitions=num_partitions)
                   .withColumn("flightNumber", "int", minValue=1000, uniqueValues=10000, random=True)
                   .withColumn("airline", "string", minValue=1, maxValue=500, prefix="airline", random=True, distribution="normal")
                   .withColumn("original_departure", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", interval="1 minute", random=True)
                   .withColumn("delay_minutes", "int", minValue=20, maxValue=600, distribution=dg.distributions.Gamma(1.0, 2.0))
                   .withColumn("delayed_departure", "timestamp", expr="cast(original_departure as bigint) + (delay_minutes * 60) ", baseColumn=["original_departure", "delay_minutes"])
                   .withColumn("reason", "string", values=delay_reasons, random=True)
                   )
df_flight_data = flightdata_defn.build()
display(df_flight_data)
You can find information on how to generate streaming data in the online documentation at https://databrickslabs.github.io/dbldatagen/public_docs/using_streaming_data.html
You can create a named temporary view over the data so that you can access it from SQL or Scala using one of two methods:
1: use createOrReplaceTempView
df_flight_data.createOrReplaceTempView("delays")
2: use options for build. In this case the name passed to the DataGenerator instance initializer will be the name of the view, i.e.
df_flight_data = flightdata_defn.build(withTempView=True)
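Either way, the generated data can then be consumed from SQL or Scala through that view; for example (a small sketch, assuming the view name delays from method 1):

%scala
val delays = spark.sql("select * from delays")
display(delays)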
This code will not work on the community edition because of this line:
val dir = s"/dbfs/tmp/$username/new-flights"
as there is no DBFS fuse mount on Databricks community edition (it's supported only on full Databricks). It's potentially possible to make it work by:
changing that directory to a local directory, like /tmp or similar, and
adding code (after writer.close()) to list the flights-* files in that local directory and use dbutils.fs.mv to move them into streamDirectory, as sketched below.
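A minimal sketch of those two changes inside run(), assuming the rest of the generator stays the same (the local directory name is illustrative):

// inside run(): write to a plain local directory instead of the /dbfs fuse mount
val localDir = "/tmp/new-flights"
new File(localDir).mkdirs()
val tempFile = File.createTempFile("flights-", "", new File(localDir)).getAbsolutePath() + ".csv"
val writer = new PrintWriter(tempFile)
// ... write the flight rows exactly as before ...
writer.close()

// then move the finished file from the local filesystem into the DBFS stream directory
dbutils.fs.mv(s"file:$tempFile", streamDirectory)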
I'm shifting the rows in an Excel sheet and inserting a new row at the beginning of the sheet. However, regardless of how many rows I shift and insert, I seem to end up with one row fewer than I should.
import org.apache.poi.ss.usermodel.Row
import Row.MissingCellPolicy._
import org.apache.poi.ss.usermodel.Sheet
import org.apache.poi.ss.usermodel.Workbook
import org.apache.poi.ss.util.CellRangeAddress
import org.apache.poi.ss.util.WorkbookUtil.createSafeSheetName
import org.apache.poi.xssf.usermodel.XSSFWorkbook
def shiftAndInsertRow(sheet: Sheet) = {
  val rowInsertionPoint = 0
  // shift all the rows down
  val lastRowNum = sheet.getLastRowNum
  println(s"Last row is $lastRowNum")
  val debugRow1 = sheet.getRow(rowInsertionPoint)
  val debugCell1 = debugRow1.getCell(0)
  // let's get a play-by-play of what's being attempted
  println(s"Current value in row $rowInsertionPoint is " +
    s"${debugCell1.getNumericCellValue}")
  println(s"Shifting rows $rowInsertionPoint and below down one row")
  sheet.shiftRows(rowInsertionPoint, lastRowNum, 1, true, true)
  val debugRow2 = sheet.getRow(rowInsertionPoint + 1)
  val debugCell2 = debugRow2.getCell(0)
  println(s"Current value in row ${rowInsertionPoint + 1} is now " +
    s"${debugCell2.getNumericCellValue}")
  println(s"Creating new row at $rowInsertionPoint in sheet")
  // create the new row
  val newRow = sheet.createRow(rowInsertionPoint)
  // set the field ID of the row
  val newCell = newRow.getCell(0, CREATE_NULL_AS_BLANK)
  println(s"Inserting value $lastRowNum at $rowInsertionPoint in sheet")
  newCell.setCellValue(lastRowNum)
  println()
}
val workbook = new XSSFWorkbook()
val sheet = workbook.createSheet(createSafeSheetName("Test 1"))
val rowNum = 0
val cellValue = -1
println(s"Creating new row at $rowNum in sheet")
// create the new row
val row = sheet.createRow(rowNum)
// set the field ID of the row
val cell = row.getCell(0, CREATE_NULL_AS_BLANK)
println(s"Inserting value $cellValue at $rowNum in sheet")
cell.setCellValue(cellValue)
println()
// insert a second row
shiftAndInsertRow(sheet)
// and a third
shiftAndInsertRow(sheet)
workbook.write(new java.io.FileOutputStream("out/test.xlsx"))
The above code creates a spreadsheet with only two rows instead of three. What am I missing?
I think your code is fine; it looks to me like this is a bug in apache-poi. It works for me on version 3.17 but breaks if I upgrade to 4.0.0.
As far as I can tell, the row num is being updated correctly, but the reference (cell.getReference) is not.
I would suggest trying to find if the bug has already been reported here https://bz.apache.org/bugzilla/buglist.cgi?product=POI and if not, filing a new bug report.
In the meantime, you could perhaps try this workaround which seems to do the trick for me. It calls updateCellReferencesForShifting on every cell in the spreadsheet.
import org.apache.poi.xssf.usermodel.XSSFCell
import scala.collection.JavaConverters._

for {
  row <- sheet.rowIterator().asScala.toList
  cell <- row.cellIterator().asScala.toList
} yield cell.asInstanceOf[XSSFCell].updateCellReferencesForShifting("")
Place this block of code right after your call to shiftRows. No guarantees that it's not going to break something else though, so use with caution!
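Concretely, in shiftAndInsertRow above that places the workaround between shiftRows and createRow, roughly like this (a sketch; the debug printing is elided):

sheet.shiftRows(rowInsertionPoint, lastRowNum, 1, true, true)

// workaround: refresh every cell's cached reference after the shift
for {
  row <- sheet.rowIterator().asScala.toList
  cell <- row.cellIterator().asScala.toList
} cell.asInstanceOf[XSSFCell].updateCellReferencesForShifting("")

val newRow = sheet.createRow(rowInsertionPoint)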
I want to implement a configurable Kafka stream which reads a row of data and applies a list of transforms, like applying functions to the fields of the record, renaming fields, etc. The stream should be completely configurable, so I can specify which transforms should be applied to which field. I'm using Avro to encode the data as GenericRecords. My problem is that I also need transforms which create new columns. Instead of overwriting the previous value of the field, they should append a new field to the record. This means the schema of the record changes.
The solution I came up with so far is to iterate over the list of transforms first to figure out which fields I need to add to the schema. I then create a new schema with the old fields and new fields combined.
The list of transforms (there is always a source field which gets passed to the transform method, and the result is then written back to the targetField):
val transforms: List[Transform] = List(
  FieldTransform(field = "referrer", targetField = "referrer", method = "mask"),
  FieldTransform(field = "name", targetField = "name_clean", method = "replaceUmlauts")
)

case class FieldTransform(field: String, targetField: String, method: String)
Method to create the new schema, based on the old schema and the list of transforms:
import scala.collection.JavaConversions._ // needed so schema.getFields (a java.util.List) supports ++

def getExtendedSchema(schema: Schema, transforms: List[Transform]): Schema = {
  var newSchema = SchemaBuilder
    .builder(schema.getNamespace)
    .record(schema.getName)
    .fields()

  // create new schema with existing fields from schemas and new fields which are created through transforms
  val fields = schema.getFields ++ getNewFields(schema, transforms)
  fields
    .foldLeft(newSchema)((newSchema, field: Schema.Field) => {
      newSchema
        .name(field.name)
        .`type`(field.schema())
        .noDefault()
      // TODO: find way to differentiate between explicitly set null defaults and fields which have no default
      //.withDefault(field.defaultValue())
    })
  newSchema.endRecord()
}
def getNewFields(schema: Schema, transforms: List[Transform]): List[Schema.Field] = {
  transforms
    .filter { // only select targetFields which are not in schema
      case FieldTransform(field, targetField, method) => schema.getField(targetField) == null
      case _ => false
    }
    .distinct
    .map { // create new Field object for each targetField
      case FieldTransform(field, targetField, method) =>
        val sourceField = schema.getField(field)
        new Schema.Field(targetField, sourceField.schema(), sourceField.doc(), sourceField.defaultValue())
    }
}
Instantiating a new GenericRecord based on an old record
val extendedSchema = getExtendedSchema(row.getSchema, transforms)
val extendedRow = new GenericData.Record(extendedSchema)
for (field <- row.getSchema.getFields) {
  extendedRow.put(field.name, row.get(field.name))
}
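The transform results are then written into the new target fields, roughly like this (applyTransform here is just a placeholder for the actual dispatch on the method name):

for (t <- transforms) {
  t match {
    case FieldTransform(field, targetField, method) =>
      // placeholder: applyTransform dispatches on the method name ("mask", "replaceUmlauts", ...)
      extendedRow.put(targetField, applyTransform(method, row.get(field)))
  }
}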
I tried to look for other solutions but couldn't find any example which had changing data types. It feels to me like there must be a simpler, cleaner solution to handle changing Avro schemas at runtime. Any ideas are appreciated.
Thanks,
Paul
I have implemented passing dynamic values to an Avro schema and validating the record (including union types) against that schema.
Example:
RestTemplate template = new RestTemplate();
HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.APPLICATION_JSON);
HttpEntity<String> entity = new HttpEntity<String>(headers);

// fetch the schema for the given subject/version from the schema registry
ResponseEntity<String> response = template.exchange(""+registryUrl+"/subjects/"+topic+"/versions/"+version+"", HttpMethod.GET, entity, String.class);
String responseData = response.getBody();

JSONObject jsonObject = new JSONObject(responseData);
JSONObject jsonObjectResult = new JSONObject(jsonResult); // jsonResult is the incoming JSON string, e.g. posted from Postman
String getData = jsonObject.get("schema").toString();

Schema.Parser parser = new Schema.Parser();
Schema schema = parser.parse(getData);

// populate a GenericRecord from the incoming JSON, field by field
GenericRecord genericRecord = new GenericData.Record(schema);
schema.getFields().stream().forEach(field -> {
    genericRecord.put(field.name(), jsonObjectResult.get(field.name()));
});

// validate the populated record against the parsed schema
GenericDatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
boolean data = reader.getData().validate(schema, genericRecord);
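In Scala, the same validation step against the extended schema from the question would look roughly like this (a sketch; extendedSchema and extendedRow are the values built above):

import org.apache.avro.generic.GenericData

// GenericData.validate walks the record and checks every value against its (possibly union) field schema
val isValid: Boolean = GenericData.get().validate(extendedSchema, extendedRow)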
How can I create a Dataframe from all my json files, when after reading each file I need to add the fileName as a field in the dataframe? It seems the variable in the for loop is not recognized outside the loop. How do I overcome this issue?
for (jsonfilename <- fileArray) {
  var df = hivecontext.read.json(jsonfilename)
  var tblLanding = df.withColumn("source_file_name", lit(jsonfilename))
}
// trying to create temp table from dataframe created in loop
tblLanding.registerTempTable("LandingTable") // ERROR here, can't resolve tblLanding
Thanks in advance,
Hossain
I think you are new to programming itself.
Anyway, here you go.
Basically, you declare the variable with its type and initialise it before the loop.
var df: DataFrame = null
for (jsonfilename <- fileArray) {
  df = hivecontext.read.json(jsonfilename)
  var tblLanding = df.withColumn("source_file_name", lit(jsonfilename))
}
df.registerTempTable("LandingTable") // no error here: df is declared outside the loop
Update
OK, you are completely new to programming, even loops.
Suppose fileArray has the values [1.json, 2.json, 3.json, 4.json].
The loop then creates 4 dataframes by reading 4 json files.
Which one do you want to register as a temp table? If all of them:
var df: DataFrame = null
var count = 0
for (jsonfilename <- fileArray) {
  df = hivecontext.read.json(jsonfilename)
  val tblLanding = df.withColumn("source_file_name", lit(jsonfilename))
  tblLanding.registerTempTable(s"LandingTable_$count") // register the dataframe that has the added column
  count += 1 // Scala has no ++ operator
}
And the reason for df being empty before this update is that your fileArray is empty or Spark failed to read the file. Print it and check.
To query any of those registered LandingTables:
val df2 = hiveContext.sql("SELECT * FROM LandingTable_0")
Update
Question has changed to making a single dataFrame from all the json files.
var dataFrame: DataFrame = null
for (jsonfilename <- fileArray) {
  val eachDataFrame = hivecontext.read.json(jsonfilename)
  if (dataFrame == null)
    dataFrame = eachDataFrame
  else
    dataFrame = eachDataFrame.unionAll(dataFrame)
}
dataFrame.registerTempTable("LandingTable")
Ensure that fileArray is not empty and that all json files in fileArray have the same schema.
// Create list of dataframes with source-file-names
val dfList = fileArray.map { filename =>
  hivecontext.read.json(filename)
    .withColumn("source_file_name", lit(filename))
}

// union the dataframes (assuming all are same schema)
val df = dfList.reduce(_ unionAll _) // or use union if spark 2.x

// register as table
df.registerTempTable("LandingTable")
I have a Spark job (running on Spark 1.3.1) that has to iterate over several keys (about 42) and process each one. Here is the structure of the program:
Get the key from a map
Fetch data from hive (hadoop-yarn underneath) that is matching the key as a data frame
Process data
Write results to hive
When I run this for one key, everything works fine. When I run it with 42 keys, I get an out of memory exception around the 12th iteration. Is there a way I can clean up memory between iterations? Help appreciated.
Here is the high level code that I am working with.
public abstract class SparkRunnableModel {
  public static SparkContext sc = null;
  public static JavaSparkContext jsc = null;
  public static HiveContext hiveContext = null;
  public static SQLContext sqlContext = null;

  protected SparkRunnableModel(String appName){
    //get the system properties to setup the model
    // Getting a java spark context object by using the constants
    SparkConf conf = new SparkConf().setAppName(appName);
    sc = new SparkContext(conf);
    jsc = new JavaSparkContext(sc);

    // Creating a hive context object connection by using java spark
    hiveContext = new org.apache.spark.sql.hive.HiveContext(sc);

    // sql context
    sqlContext = new SQLContext(sc);
  }

  public abstract void processModel(Properties properties) throws Exception;
}
class ModelRunnerMain(model: String) extends SparkRunnableModel(model: String) with Serializable {

  override def processModel(properties: Properties) = {
    val dataLoader = DataLoader.getDataLoader(properties)

    //loads keys data frame from a keys table in hive and converts that to a list
    val keysList = dataLoader.loadSeriesData()

    for (key <- keysList) {
      runModelForKey(key, dataLoader)
    }
  }

  def runModelForKey(key: String, dataLoader: DataLoader) = {
    //loads data frame from a table(~50 col X 800 rows) using "select * from table where key='<key>'"
    val keyDataFrame = dataLoader.loadKeyData()
    // filter this data frame into two data frames
    ...
    // join them to transpose
    ...
    // convert the data frame into an RDD
    ...
    // run map on the RDD to add bunch of new columns
    ...
  }
}
My data frame size is under a Meg. But I create several data frames from this by selecting and joining etc. I assume all these get garbage collected once the iteration is done.
Here is the configuration I am running with:
spark.eventLog.enabled: true
spark.broadcast.port: 7086
spark.driver.memory: 12g
spark.shuffle.spill: false
spark.serializer: org.apache.spark.serializer.KryoSerializer
spark.storage.memoryFraction: 0.7
spark.executor.cores: 8
spark.io.compression.codec: lzf
spark.shuffle.consolidateFiles: true
spark.shuffle.service.enabled: true
spark.master: yarn-client
spark.executor.instances: 8
spark.shuffle.service.port: 7337
spark.rdd.compress: true
spark.executor.memory: 48g
spark.executor.id:
spark.sql.shuffle.partitions: 700
spark.cores.max: 56
Here is the exception I am getting.
Exception in thread "dag-scheduler-event-loop" java.lang.OutOfMemoryError: Java heap space
at org.apache.spark.util.io.ByteArrayChunkOutputStream.allocateNewChunkIfNeeded(ByteArrayChunkOutputStream.scala:66)
at org.apache.spark.util.io.ByteArrayChunkOutputStream.write(ByteArrayChunkOutputStream.scala:55)
at com.ning.compress.lzf.ChunkEncoder.encodeAndWriteChunk(ChunkEncoder.java:264)
at com.ning.compress.lzf.LZFOutputStream.writeCompressedBlock(LZFOutputStream.java:266)
at com.ning.compress.lzf.LZFOutputStream.write(LZFOutputStream.java:124)
at com.esotericsoftware.kryo.io.Output.flush(Output.java:155)
at com.esotericsoftware.kryo.io.Output.require(Output.java:135)
at com.esotericsoftware.kryo.io.Output.writeBytes(Output.java:220)
at com.esotericsoftware.kryo.io.Output.writeBytes(Output.java:206)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ByteArraySerializer.write(DefaultArraySerializers.java:29)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ByteArraySerializer.write(DefaultArraySerializers.java:18)
at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:568)
at org.apache.spark.serializer.KryoSerializationStream.writeObject(KryoSerializer.scala:124)
at org.apache.spark.broadcast.TorrentBroadcast$.blockifyObject(TorrentBroadcast.scala:202)
at org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:101)
at org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:84)
at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34)
at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:29)
at org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:62)
at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1051)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitMissingTasks(DAGScheduler.scala:839)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply$mcVI$sp(DAGScheduler.scala:1042)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply(DAGScheduler.scala:1039)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply(DAGScheduler.scala:1039)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15.apply(DAGScheduler.scala:1039)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15.apply(DAGScheduler.scala:1038)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1038)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1390)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
Using checkpoint() or localCheckpoint() can cut the Spark lineage and improve the performance of the application across iterations.
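A minimal sketch of what that could look like inside the per-key loop (process and writeToHive stand in for the question's own filter/join/map and hive-write steps; on Spark 1.3.1 the checkpoint goes on the underlying RDD, since DataFrame checkpoint and localCheckpoint only arrived in later releases):

// set a reliable checkpoint directory once, before the loop (path is illustrative)
sc.setCheckpointDir("hdfs:///tmp/model-checkpoints")

for (key <- keysList) {
  val keyDataFrame = dataLoader.loadKeyData()
  val resultDataFrame = process(keyDataFrame) // placeholder for the existing filter/join/map steps

  // truncate the lineage accumulated during this iteration
  resultDataFrame.rdd.checkpoint()
  resultDataFrame.rdd.count() // an action is needed to actually materialize the checkpoint

  writeToHive(resultDataFrame) // placeholder for the existing hive write
}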