Exception in thread "main" java.lang.AbstractMethodError - scala

I am trying to use Spark GraphX.
I have tried the following code:
/**
 * Created by xyz on 5/7/16.
 */
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx._

object simple {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("untitled1")
    val sc = new SparkContext(conf)
    val graph = GraphLoader.edgeListFile(sc, "graphx/data/followers.txt", true)
      .partitionBy(PartitionStrategy.RandomVertexCut)
    val triCounts = graph.triangleCount().vertices
    val users = sc.textFile("graphx/data/users.txt").map { line =>
      val fields = line.split(",")
      (fields(0).toLong, fields(1))
    }
    val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) =>
      (username, tc)
    }
    println(triCountByUsername.collect().mkString("\n"))
  }
}
but it's giving me the following error:
Exception in thread "main" java.lang.AbstractMethodError
at org.apache.spark.Logging$class.log(Logging.scala:52)
at org.apache.spark.graphx.GraphLoader$.log(GraphLoader.scala:26)
at org.apache.spark.Logging$class.logInfo(Logging.scala:59)
at org.apache.spark.graphx.GraphLoader$.logInfo(GraphLoader.scala:26)
at org.apache.spark.graphx.GraphLoader$.edgeListFile(GraphLoader.scala:84)
at simple$.main(simple.scala:18)
at simple.main(simple.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
I have a data file with the following content:
2 1
4 1
1 2
6 3
7 3
7 6
6 7
3 7
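As a side note, not part of the original question: an AbstractMethodError raised from org.apache.spark.Logging usually indicates that spark-graphx and spark-core (or the Spark runtime the job runs against) come from different Spark versions. A hedged build.sbt sketch that keeps the two artifacts aligned (the version number is a placeholder):

// Hypothetical build.sbt sketch: keep spark-core and spark-graphx on the same Spark version,
// matching the version installed where the job runs. "1.6.1" is a placeholder.
val sparkVersion = "1.6.1"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"   % sparkVersion,
  "org.apache.spark" %% "spark-graphx" % sparkVersion
)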

Related

Spark Execution for twitter Streaming

Hi, I'm new to Spark and Scala. I'm trying to stream some tweets through Spark Streaming with the following code:
object TwitterStreaming {
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      System.err.println("WrongUsage: PropertiesFile, [<filters>]")
      System.exit(-1)
    }
    StreamingExamples.setStreaningLogLevels()
    val myConfigFile = args(0)
    val batchInterval_s = 1
    val fileConfig = ConfigFactory.parseFile(new File(myConfigFile))
    val appConf = ConfigFactory.load(fileConfig)
    // Set the system properties so that Twitter4j library used by twitter stream
    // can use them to generate OAuth credentials
    System.setProperty("twitter4j.oauth.consumerKey", appConf.getString("consumerKey"))
    System.setProperty("twitter4j.oauth.consumerSecret", appConf.getString("consumerSecret"))
    System.setProperty("twitter4j.oauth.accessToken", appConf.getString("accessToken"))
    System.setProperty("twitter4j.oauth.accessTokenSecret", appConf.getString("accessTokenSecret"))
    val sparkConf = new SparkConf().setAppName("TwitterStreaming").setMaster(appConf.getString("SPARK_MASTER")) // local[2]
    val ssc = new StreamingContext(sparkConf, Seconds(batchInterval_s)) // creating spark streaming context
    val stream = TwitterUtils.createStream(ssc, None)
    val tweet_data = stream.map(status => TweetData(status.getId, "#" + status.getUser.getScreenName, status.getText.trim()))
    tweet_data.foreachRDD(rdd => {
      println(s"A sample of tweets I gathered over ${batchInterval_s}s: ${rdd.take(10).mkString(" ")} (total tweets fetched: ${rdd.count()})")
    })
  }
}

case class TweetData(id: BigInt, author: String, tweetText: String)
My Error:
Exception in thread "main" com.typesafe.config.ConfigException$WrongType:/WorkSpace/InputFiles/application.conf: 5: Cannot concatenate object or list with a non-object-or-list, ConfigString("local") and SimpleConfigList([2]) are not compatible
at com.typesafe.config.impl.ConfigConcatenation.join(ConfigConcatenation.java:116)
Can anyone check the code and tell me where I'm going wrong?
If your config file contains:
SPARK_MASTER=local[2]
Change it to:
SPARK_MASTER="local[2]"
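For illustration (this snippet is not part of the original answer): HOCON reads an unquoted local[2] as the string local concatenated with the list [2], which is exactly the WrongType error above; quoting turns it into a single string. A minimal sketch using the same Typesafe Config library the question code relies on:

// Minimal sketch, assuming only the com.typesafe.config dependency already used above.
import com.typesafe.config.{ConfigException, ConfigFactory}

object ConfigQuotingCheck extends App {
  // Quoted: the whole value is parsed as one plain string.
  val quoted = ConfigFactory.parseString("SPARK_MASTER = \"local[2]\"")
  println(quoted.getString("SPARK_MASTER")) // prints local[2]

  // Unquoted: HOCON treats it as a string/list concatenation and rejects it with the
  // same ConfigException$WrongType reported in the question.
  try {
    val unquoted = ConfigFactory.parseString("SPARK_MASTER = local[2]")
    println(unquoted.getString("SPARK_MASTER"))
  } catch {
    case e: ConfigException => println("fails as in the question: " + e.getMessage)
  }
}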

Saving Spark DataFrames into a database via SparkSQL works in "local[*]" but not in YARN mode

I process a set of files using Spark. The results, after conversion to a Spark DataFrame, should be saved to a database. The following code works when Spark runs in "local[*]" mode. But when I run it on a cluster in YARN mode, processing ends without errors (except for some of these errors at the very beginning), yet the database remains empty.
import java.sql.{Connection, DriverManager, Timestamp, SQLException}
import java.util.Properties
import org.apache.spark.sql.SparkSession
import scala.collection.JavaConverters._
import java.util.Calendar
import scala.collection.mutable.ListBuffer
import com.qbeats.cortex.library.{PartialDateTime, TimeExtractor}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._
object CommoncrawlExtractor extends App {
var driver: String = null
var connectionString: String = null
var helper: Helper = null
var sc: SparkContext = null
var pte = sc.broadcast(new TimeExtractor)
def uncertainty = 60 * 60 * 12
case class SectionData(warcinfoID: String, recordID: String, sectionName: Int,
timestamp: Timestamp, uncertainty: Int, wordsets: Array[Array[String]])
case class Word(word: String)
case class Wordset(section_id: Int, wordset: Seq[Int])
def dropFirst(iterator: Iterator[String]): Iterator[String] = {
if (iterator.hasNext) {
iterator.next
}
iterator
}
def extractSentences(entity: String) = {
val result = ListBuffer[(String, String, Int, Timestamp, Int, Array[Array[String]])]()
val warcinfoIDPattern = """WARC-Warcinfo-ID: <urn:uuid:(.+)>""".r
val warcinfoID = warcinfoIDPattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")
val recordIDPattern = """WARC-Record-ID: <urn:uuid:(.+)>""".r
val recordID = recordIDPattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")
val requestTimePattern = """WARC-Date: (.+)""".r
val requestTimeString = requestTimePattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")
val requestTimeFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'")
val requestTime = requestTimeFormat.parse(requestTimeString)
var cal: Calendar = Calendar.getInstance()
cal.setTime(requestTime)
val referenceDate1 = new PartialDateTime(cal, null)
val contentPattern = """(?s)\r\nHTTP/1\.. 200(.+?)(\r\n){2,}(.+)WARC/1.0\r\nWARC-Type: metadata""".r
val contentString = contentPattern.findFirstMatchIn(entity).map(_ group 3).getOrElse("")
try {
val de = pte.value.extractTimes(contentString)
if (de.getEntries != null) {
for (entry <- de.getEntries.asScala) {
val pdt = entry.resolve(12 * 3600, referenceDate1)
if (pdt != null) {
val sectionWordsets = entry.getSentences.asScala.map(x => x.getTokens.asScala.toArray[String]).toArray
val sectionData = (
warcinfoID, recordID, entry.getId,
new Timestamp(pdt.secondsSinceEpoch * 1000), pdt.uncertaintyInterval.toInt, sectionWordsets
)
result += sectionData
}
}
}
} catch {
case e: Exception => println("\n" + "-" * 100 + "\n" + entity)
}
result
}
def initDB() = {
driver = "org.postgresql.Driver"
connectionString = "jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl?user=postgres&password=postgres"
Class.forName(driver)
}
def prepareDB() = {
var conn: Connection = null
try {
conn = DriverManager.getConnection(connectionString)
val statement = conn.createStatement()
val tableResultSet = statement.executeQuery(
"""
|SELECT table_name
| FROM information_schema.tables
| WHERE table_schema='public'
| AND table_type='BASE TABLE';
""".stripMargin)
val tablesToDelete = ListBuffer[String]()
while (tableResultSet.next()) {
tableResultSet.getString("table_name") match {
case "warcinfo" => tablesToDelete.append("warcinfo")
case "record" => tablesToDelete.append("record")
case "section" => tablesToDelete.append("section")
case "word" => tablesToDelete.append("word")
case "wordset" => tablesToDelete.append("wordset")
case _ =>
}
}
for (tableName <- tablesToDelete) statement.executeUpdate("DROP TABLE " + tableName + ";")
val storedProcedureResultSet = statement.executeQuery(
"""
|SELECT proname, prosrc
|FROM pg_catalog.pg_namespace n
|JOIN pg_catalog.pg_proc p
|ON pronamespace = n.oid
|WHERE nspname = 'public';
""".stripMargin)
val storedProcedureDeletions = ListBuffer[String]()
while (storedProcedureResultSet.next()) {
storedProcedureResultSet.getString("proname") match {
case "update_word_ids" =>
storedProcedureDeletions.append("DROP FUNCTION update_word_ids();")
case _ =>
}
}
statement.executeUpdate("DROP TRIGGER IF EXISTS update_word_ids_trigger ON wordset_occurrence;")
for (storedProcedureDeletion <- storedProcedureDeletions) statement.executeUpdate(storedProcedureDeletion)
statement.executeUpdate(
"""
|CREATE TABLE warcinfo (
| warcinfo_id serial PRIMARY KEY,
| batch_name varchar NOT NULL,
| warcinfo_uuid char(36) NOT NULL
|);
""".stripMargin)
statement.executeUpdate(
"""
|CREATE TABLE record (
| record_id serial PRIMARY KEY,
| record_uuid char(36) NOT NULL
|);
""".stripMargin)
statement.executeUpdate(
"""
|CREATE TABLE section (
| section_id serial PRIMARY KEY,
| record_id integer NOT NULL,
| section_name integer NOT NULL,
| timestamp timestamp NOT NULL,
| uncertainty integer NOT NULL
|);
""".stripMargin)
statement.executeUpdate(
"""
|CREATE TABLE word (
| word_id serial PRIMARY KEY,
| word varchar NOT NULL
|);
""".stripMargin)
statement.executeUpdate(
"""
|CREATE TABLE wordset (
| section_id integer NOT NULL,
| wordset integer ARRAY
|);
""".stripMargin)
} catch {
case e: SQLException => println("exception caught: " + e)
} finally {
if (conn != null) conn.close()
}
}
def processFile(fileNames: Array[String], accessKeyId: String = "", secretAccessKey: String = ""): Unit = {
val delimiter = "WARC/1.0\r\nWARC-Type: request\r\n"
pte = sc.broadcast(new TimeExtractor)
val spark = SparkSession
.builder()
.appName("CommoncrawlExtractor")
.getOrCreate()
import spark.implicits._
val connString = "jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl"
val prop = new Properties()
prop.put("user", "postgres")
prop.put("password", "postgres")
val entities = sc.
textFile(fileNames.mkString(",")).
mapPartitions(dropFirst).
map(delimiter + _).
flatMap(extractSentences).
map(x => SectionData(x._1, x._2, x._3, x._4, x._5, x._6)).toDF().
cache()
val warcinfo = entities.select("warcinfoID").distinct().
withColumnRenamed("warcinfoID", "warcinfo_uuid").
withColumn("batch_name", lit("June 2016, batch 1"))
val warcinfoWriter = warcinfo.write.mode("append")
println("Saving warcinfo.")
println(Calendar.getInstance().getTime)
warcinfoWriter.jdbc(connString, "warcinfo", prop)
println(Calendar.getInstance().getTime)
val record = entities.select("recordID").distinct().
withColumnRenamed("recordID", "record_uuid")
val recordWriter = record.write.mode("append")
println("Saving records.")
println(Calendar.getInstance().getTime)
recordWriter.jdbc(connString, "record", prop)
println(Calendar.getInstance().getTime)
val recordFull = spark.read.
format("jdbc").
options(Map("url" -> connString, "dbtable" -> "public.record", "user" -> "postgres", "password" -> "postgres")).
load().cache()
val section = entities.
join(recordFull, entities.col("recordID").equalTo(recordFull("record_uuid"))).
select("record_id", "sectionName", "timestamp", "uncertainty").distinct().
withColumnRenamed("sectionName", "section_name")
val sectionWriter = section.write.mode("append")
println("Saving sections.")
println(Calendar.getInstance().getTime)
sectionWriter.jdbc(connString, "section", prop)
println(Calendar.getInstance().getTime)
val sectionFull = spark.read.
format("jdbc").
options(Map("url" -> connString, "dbtable" -> "public.section", "user" -> "postgres", "password" -> "postgres")).
load()
val word = entities.
select("wordsets").
flatMap(r => r.getAs[Seq[Seq[String]]]("wordsets").flatten).
distinct().
map(Word(_))
val wordWriter = word.write.mode("append")
wordWriter.jdbc(connString, "word", prop)
val wordFull = spark.read.
format("jdbc").
options(Map("url" -> connString, "dbtable" -> "public.word", "user" -> "postgres", "password" -> "postgres")).
load().
map(row => (row.getAs[String]("word"), row.getAs[Int]("word_id"))).
collect().
toMap
val wordsetTemp = entities.
join(recordFull, entities.col("recordID").equalTo(recordFull("record_uuid"))).
withColumnRenamed("sectionName", "section_name")
val wordset = wordsetTemp.
join(sectionFull, Seq("record_id", "section_name")).
select("section_id", "wordsets").
flatMap(r => r.getAs[Seq[Seq[String]]]("wordsets").map(x => Wordset(r.getAs[Int]("section_id"), x.map(wordFull))))
val wordsetWriter = wordset.write.mode("append")
println("Saving wordsets.")
println(Calendar.getInstance().getTime)
wordsetWriter.jdbc(connString, "wordset", prop)
println(Calendar.getInstance().getTime)
// entities.saveAsTextFile(helper.outputDirectory + "xyz")
sc.stop
}
override def main(args: Array[String]): Unit = {
if (args.length >= 2) {
initDB()
prepareDB()
helper = new Helper
val files =
if (args(0).startsWith("hdfs://")) helper.getHDFSFiles(args(0)).slice(0, args(3).toInt)
else helper.getLocalFiles(args(0))
val appName = "CommoncrawlExtractor"
val conf = new SparkConf().setAppName(appName)
if (args(0).startsWith("hdfs://")) {
conf.set("spark.executor.instances", args(1))
conf.set("spark.executor.cores", args(2))
} else conf.setMaster(args(1))
sc = new SparkContext(conf)
val delimiter = "WARC/1.0\r\nWARC-Type: request"
sc.hadoopConfiguration.set("textinputformat.record.delimiter", delimiter)
processFile(files)
}
}
}
I copied postgresql-9.4.1209.jre7.jar to /home/user/Programs/libs on every machine in the cluster and used the following command (run from Spark's directory):
./bin/spark-submit --master yarn --deploy-mode client --driver-class-path /home/user/Programs/libs/postgresql-9.4.1209.jre7.jar --jars /home/user/Programs/libs/postgresql-9.4.1209.jre7.jar --conf "spark.driver.extraClassPath=/home/user/Programs/libs/postgresql-9.4.1209.jre7.jar" --conf "spark.executor.extraClassPath=/home/user/Programs/libs/postgresql-9.4.1209.jre7.jar" spark-cortex-fat.jar hdfs://LV-WS10.lviv:9000/commoncrawl 2 4 8
Please suggest how I can make it work on the cluster.
ADDED LATER:
I discovered that these lines
val warcinfo = entities.select("warcinfoID").
withColumnRenamed("warcinfoID", "warcinfo_uuid").
withColumn("batch_name", lit("June 2016, batch 1"))
val warcinfoWriter = warcinfo.write.mode("append")
println("Saving warcinfo.")
println(Calendar.getInstance().getTime)
warcinfoWriter.jdbc(connString, "warcinfo", prop)
println(Calendar.getInstance().getTime)
lead to the following exception:
16/09/01 17:31:51 WARN scheduler.TaskSetManager: Lost task 0.1 in stage 1.0 (TID 5, LV-WS09): org.apache.spark.storage.BlockFetchException: Failed to fetch block after 1 fetch failures. Most recent failure cause:
at org.apache.spark.storage.BlockManager.getRemoteBytes(BlockManager.scala:565)
at org.apache.spark.storage.BlockManager.getRemoteValues(BlockManager.scala:522)
at org.apache.spark.storage.BlockManager.get(BlockManager.scala:609)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:661)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:96)
at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:95)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1203)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1203)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1203)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1325)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1211)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1190)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Exception thrown in awaitResult:
at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:194)
at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:104)
at org.apache.spark.storage.BlockManager.getRemoteBytes(BlockManager.scala:554)
... 31 more
Caused by: java.io.IOException: Failed to connect to ubuntu-cluster-4/192.168.100.139:36378
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:228)
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:179)
at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:96)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
... 3 more
Caused by: java.net.ConnectException: Connection refused: ubuntu-cluster-4/192.168.100.139:36378
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717)
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
... 1 more
However, some records do get stored in the database.
What would you suggest?
ADDED LATER:
I looked at the YARN logs on the node that stopped responding, but they weren't helpful.

Spark Hadoop Failed to get broadcast

I'm running a spark-submit job and receiving a "Failed to get broadcast_58_piece0..." error. I'm really not sure what I'm doing wrong. Am I overusing UDFs? Is the function too complicated?
As a summary of my objective: I am parsing text from PDFs, which are stored as base64-encoded strings in JSON objects. I'm using Apache Tika to get the text and trying to make copious use of data frames to make things easier.
I had written a piece of code that ran the text extraction through Tika as a function outside of "main", on the data as an RDD, and that worked flawlessly. When I try to bring the extraction into main as a UDF on data frames, though, it breaks in various ways. Before I got here, I was actually trying to write the final data frame as:
valid.toJSON.saveAsTextFile(hdfs_dir)
This was giving me all sorts of "File/Path already exists" headaches.
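As an aside (this just restates what the final line of the code below already does), writing through the DataFrame writer with overwrite mode avoids the "path already exists" failures of saveAsTextFile:

// Equivalent of the write used at the end of the code below; `valid` is the DataFrame
// referred to above, and SaveMode comes from org.apache.spark.sql.
valid.write.mode(SaveMode.Overwrite).format("json").save(hdfs_dir)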
Current code:
object Driver {
def main(args: Array[String]):Unit = {
val hdfs_dir = args(0)
val spark_conf = new SparkConf().setAppName("Spark Tika HDFS")
val sc = new SparkContext(spark_conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
// load json data into dataframe
val df = sqlContext.read.json("hdfs://hadoophost.com:8888/user/spark/data/in/*")
val extractInfo: (Array[Byte] => String) = (fp: Array[Byte]) => {
val parser:Parser = new AutoDetectParser()
val handler:BodyContentHandler = new BodyContentHandler(Integer.MAX_VALUE)
val config:TesseractOCRConfig = new TesseractOCRConfig()
val pdfConfig:PDFParserConfig = new PDFParserConfig()
val inputstream:InputStream = new ByteArrayInputStream(fp)
val metadata:Metadata = new Metadata()
val parseContext:ParseContext = new ParseContext()
parseContext.set(classOf[TesseractOCRConfig], config)
parseContext.set(classOf[PDFParserConfig], pdfConfig)
parseContext.set(classOf[Parser], parser)
parser.parse(inputstream, handler, metadata, parseContext)
handler.toString
}
val extract_udf = udf(extractInfo)
val df2 = df.withColumn("unbased_media", unbase64($"media_file")).drop("media_file")
val dfRenamed = df2.withColumn("media_corpus", extract_udf(col("unbased_media"))).drop("unbased_media")
val depuncter: (String => String) = (corpus: String) => {
val r = corpus.replaceAll("""[\p{Punct}]""", "")
val s = r.replaceAll("""[0-9]""", "")
s
}
val depuncter_udf = udf(depuncter)
val withoutPunct = dfRenamed.withColumn("sentence", depuncter_udf(col("media_corpus")))
val model = sc.objectFile[org.apache.spark.ml.PipelineModel]("hdfs://hadoophost.com:8888/user/spark/hawkeye-nb-ml-v2.0").first()
val with_predictions = model.transform(withoutPunct)
val fullNameChecker: ((String, String, String, String, String) => String) = (fname: String, mname: String, lname: String, sfx: String, text: String) =>{
val newtext = text.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_fname = fname.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_mname = mname.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_lname = lname.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_sfx = sfx.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val name_full = new_fname.concat(new_mname).concat(new_lname).concat(new_sfx)
val c = name_full.r.findAllIn(newtext).length
c match {
case 0 => "N"
case _ => "Y"
}
}
val fullNameChecker_udf = udf(fullNameChecker)
val stringChecker: ((String, String) => String) = (term: String, text: String) => {
val termLower = term.replaceAll("""[\p{Punct}]""", "").toLowerCase
val textLower = text.replaceAll("""[\p{Punct}]""", "").toLowerCase
val c = termLower.r.findAllIn(textLower).length
c match {
case 0 => "N"
case _ => "Y"
}
}
val stringChecker_udf = udf(stringChecker)
val stringChecker2: ((String, String) => String) = (term: String, text: String) => {
val termLower = term takeRight 4
val textLower = text
val c = termLower.r.findAllIn(textLower).length
c match {
case 0 => "N"
case _ => "Y"
}
}
val stringChecker2_udf = udf(stringChecker)
val valids = with_predictions.withColumn("fname_valid", stringChecker_udf(col("first_name"), col("media_corpus")))
.withColumn("lname_valid", stringChecker_udf(col("last_name"), col("media_corpus")))
.withColumn("fname2_valid", stringChecker_udf(col("first_name_2"), col("media_corpus")))
.withColumn("lname2_valid", stringChecker_udf(col("last_name_2"), col("media_corpus")))
.withColumn("camt_valid", stringChecker_udf(col("chargeoff_amount"), col("media_corpus")))
.withColumn("ocan_valid", stringChecker2_udf(col("original_creditor_account_nbr"), col("media_corpus")))
.withColumn("dpan_valid", stringChecker2_udf(col("debt_provider_account_nbr"), col("media_corpus")))
.withColumn("full_name_valid", fullNameChecker_udf(col("first_name"), col("middle_name"), col("last_name"), col("suffix"), col("media_corpus")))
.withColumn("full_name_2_valid", fullNameChecker_udf(col("first_name_2"), col("middle_name_2"), col("last_name_2"), col("suffix_2"), col("media_corpus")))
valids.write.mode(SaveMode.Overwrite).format("json").save(hdfs_dir)
}
}
Full stack trace starting with error:
16/06/14 15:02:01 WARN TaskSetManager: Lost task 0.0 in stage 4.0 (TID 53, hdpd11n05.squaretwofinancial.com): org.apache.spark.SparkException: Task failed while writing rows.
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:272)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_58_piece0 of broadcast_58
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1222)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:165)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:88)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9$$anonfun$apply$7.apply(CountVectorizer.scala:222)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9$$anonfun$apply$7.apply(CountVectorizer.scala:221)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:34)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9.apply(CountVectorizer.scala:221)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9.apply(CountVectorizer.scala:218)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.evalExpr43$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.execution.Project$$anonfun$1$$anonfun$apply$1.apply(basicOperators.scala:51)
at org.apache.spark.sql.execution.Project$$anonfun$1$$anonfun$apply$1.apply(basicOperators.scala:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:263)
... 8 more
Caused by: org.apache.spark.SparkException: Failed to get broadcast_58_piece0 of broadcast_58
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply$mcVI$sp(TorrentBroadcast.scala:137)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.broadcast.TorrentBroadcast.org$apache$spark$broadcast$TorrentBroadcast$$readBlocks(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:175)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1219)
... 25 more
I encountered a similar error.
It turned out to be caused by the broadcast usage in CountVectorizerModel. Here is the detailed cause in my case:
When model.transform() is called, the vocabulary is broadcast and implicitly saved in the model as a private var attribute (broadcastDic). Therefore, if the CountVectorizerModel is saved after calling model.transform(), that broadcastDic attribute is saved along with it. Unfortunately, a broadcast object in Spark is context-sensitive, meaning it is tied to its SparkContext. If that CountVectorizerModel is later loaded in a different SparkContext, it fails to find the previously saved broadcastDic.
So the fix is either to avoid calling model.transform() before saving the model, or to clone the model with model.copy().
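A rough sketch of the second option, under the assumption that the model is an ML PipelineModel as in the question (the helper name and path are hypothetical):

// Hedged sketch: save a copy of the model rather than the instance that has already
// run transform(), so no SparkContext-bound broadcast state is persisted with it.
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.ParamMap

def saveCleanCopy(model: PipelineModel, path: String): Unit = {
  // per the explanation above, copy() yields an equivalent model without the
  // broadcast created during transform()
  model.copy(ParamMap.empty).write.overwrite().save(path)
}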
For anyone coming across this: it turns out the model I was loading was malformed. I found out by using spark-shell in yarn-client mode and stepping through the code. When I tried to load the model it was fine, but running it against the data frame (model.transform) threw errors about not finding a metadata directory.
I went back, found a good model, ran against that, and it worked fine. The code itself is actually sound.
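The check described above boils down to something like the following spark-shell (yarn-client) session; the model path mirrors the one in the question, and prepared is a placeholder for a small DataFrame carrying the input columns the pipeline expects (e.g. "sentence"):

// Hedged sketch of the spark-shell check: a malformed model typically loads fine
// but fails at transform time, as described above. `prepared` is hypothetical.
val model = sc.objectFile[org.apache.spark.ml.PipelineModel]("hdfs://hadoophost.com:8888/user/spark/hawkeye-nb-ml-v2.0").first()
val prepared: org.apache.spark.sql.DataFrame = ??? // small sample with the expected input columns
model.transform(prepared).show()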

Scala : java.lang.IllegalArgumentException: requirement failed: nonEmpty input

I have written code to process TIFF files, do map algebra operations, and store the result back as a TIFF file. When I run the code I get this error. I am passing only one argument to the code, which is a file path. What could be causing the error?
Error :
Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: nonEmpty input
at scala.Predef$.require(Predef.scala:233)
at geotrellis.spark.stitch.TileLayoutStitcher$.stitch(StitchRDDMethods.scala:21)
at geotrellis.spark.stitch.SpatialTileLayoutRDDMethods.stitch(StitchRDDMethods.scala:42)
at RasterData.RasterOperations.MapAlgebraOperations$.readHdfsTiles(MapAlgebraOperations.scala:97)
at RasterData.RasterOperations.MapAlgebraOperations$.main(MapAlgebraOperations.scala:27)
at RasterData.RasterOperations.MapAlgebraOperations.main(MapAlgebraOperations.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:685)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
code:
object MapAlgebraOperations {
def main(args: Array[String]) : Unit = {
readHdfsTiles(args(0))
}
def readHdfsTiles(readpath: String): TileLayerRDD[SpatialKey] = {
implicit val sc = SparkUtils.createSparkContext("MapAlgebra")
//layer name for reading the RasterRDD
val nlcd1 = new String("nlcd1")
val nlcd2 = new String("nlcd2")
//Hadoop Tiles Location
val HdfsPath = new Path(readpath)
val reader = HadoopLayerReader(HdfsPath)(sc)
val rastereurope = reader.read[SpatialKey, Tile, TileLayerMetadata[SpatialKey]](LayerId(nlcd1, 1))
val rasterasia = reader.read[SpatialKey, Tile, TileLayerMetadata[SpatialKey]](LayerId(nlcd2, 1))
// Local Operations on Tiles
// Add 100 to each cell on raster Europe
val AddRasterEurope: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rastereurope.withContext { _ localAdd 100 }
// Subtract 5 to each cell on raster Asia
val SubtractRasterAsia: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rasterasia.withContext { _ localSubtract 1 }
// Union
val EuropeUnionAsia: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rastereurope.withContext { _ union(SubtractRasterAsia) }
//Intersection
val EuropeIntersectAsia: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rastereurope.withContext { _ intersection(rasterasia) }
//focal operations on Europe
val focalEurope = rastereurope.focalMean(Circle(5))
//Chain Focal Operations
val focalEuropeChain = focalEurope.focalMean(Circle(5))
//LeftOuterJoin
//val LeftOuterJoin = rastereurope.leftOuterJoin(rasterasia).updateValues(Add(_, _))
//Crop India from Asia
//Spatial Join
//val EuropeSpatialJoinAsia = rastereurope.spatialJoin(rasterasia)
//Define new layer
//val layerid1 = new LayerId("newnlcd1",1)
//val layerid2 = new LayerId("newnlcd2",1)
//val layerid3 = new LayerId("newnlcd3",1)
//Writing the resultant RDD to HDFS
//val writer = HadoopLayerWriter(HdfsPath)(sc)
//writer.write(layerid1, rddWithContext, ZCurveKeyIndexMethod)
//writer.write(layerid2, rddWithContext1, ZCurveKeyIndexMethod)
//writer.write(layerid3, rddWithContext2, ZCurveKeyIndexMethod)
//Convert the data into GeoTiff
val addrasterEurope = AddRasterEurope.stitch
GeoTiff(addrasterEurope, AddRasterEurope.metadata.crs).write("/home/brillio/tiffs/AddRasterEurope.tiff")
val subrasterAsia = SubtractRasterAsia.stitch
GeoTiff(subrasterAsia, SubtractRasterAsia.metadata.crs).write("/home/brillio/tiffs/SubtractRasterAsia.tiff")
val rasterUnion = EuropeUnionAsia.stitch
GeoTiff(rasterUnion, EuropeUnionAsia.metadata.crs).write("/home/brillio/tiffs/EuropeUnionAsia.tiff")
val rasterIntersect = EuropeIntersectAsia.stitch
GeoTiff(rasterIntersect, EuropeIntersectAsia.metadata.crs).write("/home/brillio/tiffs/EuropeIntersectAsia.tiff")
val rasterfocal = focalEurope.stitch
GeoTiff(rasterfocal, focalEurope.metadata.crs).write("/home/brillio/tiffs/rasterfocal.tiff")
val rasterChainfocal = focalEuropeChain.stitch
GeoTiff(rasterChainfocal, focalEuropeChain.metadata.crs).write("/home/brillio/tiffs/rasterChainfocal.tiff")
//val EuropeLeftOuterAsia = LeftOuterJoin.stitch
//GeoTiff(EuropeLeftOuterAsia, LeftOuterJoin.metadata.crs).write("/home/brillio/tiffs/rasterChainfocal.tiff")
//val rasterres3 = EuropeSpatialJoinAsia.stitch
//GeoTiff(rasterres3, EuropeSpatialJoinAsia.metadata.crs).write("/home/brillio/tiffs/EuropeSpatialJoinAsia.tiff")
AddRasterEurope
}
}
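One observation, not part of the original question: requirement failed: nonEmpty input is the precondition check inside TileLayoutStitcher.stitch, so the tile RDD being stitched at MapAlgebraOperations.scala:97 was empty, which usually means the layer read back from HDFS contained no records. A hedged sanity check before stitching, reusing the names from the code above, would be:

// Hedged diagnostic sketch (names reuse the code above): confirm the layers read back
// from HDFS are non-empty before stitching, since stitch() requires non-empty input.
require(rastereurope.count() > 0, s"layer $nlcd1 (zoom 1) is empty; check the HDFS catalog path and LayerId")
require(rasterasia.count() > 0, s"layer $nlcd2 (zoom 1) is empty; check the HDFS catalog path and LayerId")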

Duplicate class Definition found error in spark

I am using Spark Streaming with Kinesis and I am facing this exception when I run my code.
Here is my code:
System.setProperty("AWS_ACCESS_KEY_ID", KinesisProperties.AWS_ACCESS_KEY_ID)
System.setProperty("AWS_SECRET_KEY", KinesisProperties.AWS_SECRET_KEY)

var kinesisClient: AmazonKinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain())
kinesisClient.setEndpoint(KinesisProperties.KINESIS_ENDPOINT_URL,
  KinesisProperties.KINESIS_SERVICE_NAME,
  KinesisProperties.KINESIS_REGION_ID)
val numShards = kinesisClient.describeStream(KinesisProperties.MY_STREAM_NAME)
  .getStreamDescription().getShards().size()
val numStreams = numShards

val ssc = StreamingHelper.getStreamingInstance(new Duration(KinesisProperties.KINESIS_CHECKPOINT_INTERVAL))
ssc.addStreamingListener(new MyStreamListener)

val kinesisStreams = (0 until numStreams).map { i =>
  KinesisUtils.createStream(ssc, KinesisProperties.MY_STREAM_NAME,
    KinesisProperties.KINESIS_ENDPOINT_URL,
    new Duration(KinesisProperties.KINESIS_CHECKPOINT_INTERVAL), InitialPositionInStream.TRIM_HORIZON,
    null)
}

/* Union all the streams */
val unionStreams = ssc.union(kinesisStreams)
val tmp_stream = unionStreams.map(byteArray => new String(byteArray))
val data = tmp_stream.window(Seconds(KinesisProperties.WINDOW_INTERVAL), Seconds(KinesisProperties.SLIDING_INTERVAL))
data.foreachRDD((rdd: RDD[String], time: Time) => {
  if (rdd.take(1).size == 1) {
    rdd.saveAsTextFile(KinesisProperties.Sink + time.milliseconds)
  }
})

ssc.start()
ssc.awaitTermination()
And I get the following exception:
java.lang.LinkageError: loader (instance of org/apache/spark/executor/ChildExecutorURLClassLoader$userClassLoader$): attempted duplicate class definition for name: "com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream"