I am following the documentation here:
https://docs.databricks.com/user-guide/faq/cython.html
I can get the short sample code there to work, but when I incorporate it into my longer code, I get this error:
File "./jobs.zip/jobs/util.py", line 51, in wrapped
cython_function_ = getattr(__import__(module), method)
File "/usr/local/lib64/python2.7/site-packages/pyximport/pyximport.py", line 458, in load_module
language_level=self.language_level)
File "/usr/local/lib64/python2.7/site-packages/pyximport/pyximport.py", line 233, in load_module
exec("raise exc, None, tb", {'exc': exc, 'tb': tb})
File "/usr/local/lib64/python2.7/site-packages/pyximport/pyximport.py", line 216, in load_module
mod = imp.load_dynamic(name, so_path)
ImportError: Building module cython_util failed: ['ImportError: /home/.pyxbld/lib.linux-x86_64-2.7/cython_util.so: file too short\n']
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Yet, after this error the Spark program seems to keep running. Does anyone know what this complaint means? How can the .so file be "too short"? Can I ignore it and move on, since the program continues to run?
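For context, the wrapper in that FAQ compiles the .pyx on each executor at import time via pyximport, and a "file too short" .so is usually a truncated shared object; one common cause is several worker processes building into the same shared ~/.pyxbld directory at once. Below is a minimal, hedged sketch of one possible workaround, giving pyximport a per-process build directory. The spark_cython and cython_util names are taken from the FAQ and the traceback above; everything else is an assumption, not the actual jobs.zip code.
import os
import tempfile

def spark_cython(module, method):
    def wrapped(*args, **kwargs):
        global cython_function_
        try:
            return cython_function_(*args, **kwargs)
        except NameError:
            import pyximport
            # Build into a per-process directory instead of the shared
            # ~/.pyxbld, so concurrent executor processes cannot truncate
            # each other's freshly written .so files.
            build_dir = os.path.join(tempfile.gettempdir(),
                                     'pyxbld_{}'.format(os.getpid()))
            pyximport.install(build_dir=build_dir)
            cython_function_ = getattr(__import__(module), method)
        return cython_function_(*args, **kwargs)
    return wrapped

# hypothetical use on the executors:
# rdd.map(spark_cython('cython_util', 'some_method'))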
Related
I am trying to write OCR code using Spark and pytesseract, and I am running into a "pytesseract module not found" error even though the pytesseract module is installed.
import pytesseract
from PIL import Image

path = '/XXXX/JupyterLab/notebooks/testdir'
rdd = sc.binaryFiles(path)
rdd.keys().collect()
# --> ['file:XXX/JupyterLab/notebooks/testdir/copy.png']
input = rdd.keys().map(lambda s: s.replace("file:", ""))

def read(x):
    # re-import inside the function so it is resolved on the executors
    import pytesseract
    image = Image.open(x)
    text = pytesseract.image_to_string(image)   # was image_to_open, which does not exist
    return text

newRdd = input.map(lambda x: read(x))
newRdd.collect()
"On newRdd.collect() I get following error"
ModuleNotFoundError: No module named 'pytesseract'at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$16.apply(RDD.scala:960)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$16.apply(RDD.scala:960)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2111)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2111)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:420)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I am not sure how I can pass the rdd.keys() value, which holds the image path, to pytesseract.image_to_string() via Image.open().
Thank you.
My error was resolved by adding
sc.addPyFile('/pathto........../pytesseract/pytesseract.py')
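As a side note on passing the path to Image.open(): since sc.binaryFiles() already returns (path, bytes) pairs, the image can be decoded straight from the bytes without stripping the "file:" prefix. A minimal sketch, assuming pytesseract, Pillow, and the Tesseract binary are available on every executor (the path and function names below are illustrative, not from the original post):
import io

def ocr_record(record):
    # record is a (path, binary content) pair from sc.binaryFiles()
    path, content = record
    from PIL import Image       # import inside the function so executors resolve it
    import pytesseract
    image = Image.open(io.BytesIO(content))
    return path, pytesseract.image_to_string(image)

rdd = sc.binaryFiles('/XXXX/JupyterLab/notebooks/testdir')
texts = rdd.map(ocr_record).collect()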
I am running the below code in PyCharm; this code works properly if I provide --jars through the command prompt.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pySparksqLite_test")\
    .config('spark.jars.packages', "C:/jars/DataVisualization/sqlite-jdbc-3.20.0.jar")\
    .getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", "5")
# note: DataFrameReader.options(**kwargs) is used here; .option() takes a (key, value) pair
df_flight_info = spark.read.format("jdbc")\
    .options(url="jdbc:sqlite:C:/sqlite-tools-win32-x86-3290000/my-sqlite.db",
             driver="org.sqlite.JDBC",
             dbtable="(select DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count from flight_info)")\
    .load()
but with PyCharm I am getting the below error:
Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: Provided Maven Coordinates must be in the form 'groupId:artifactId:version'. The coordinate provided is: C:/Users/jars/sqlite-jdbc-3.20.0.jar
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.deploy.SparkSubmitUtils$$anonfun$extractMavenCoordinates$1.apply(SparkSubmit.scala:1000)
at org.apache.spark.deploy.SparkSubmitUtils$$anonfun$extractMavenCoordinates$1.apply(SparkSubmit.scala:998)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at org.apache.spark.deploy.SparkSubmitUtils$.extractMavenCoordinates(SparkSubmit.scala:998)
at org.apache.spark.deploy.SparkSubmitUtils$.resolveMavenCoordinates(SparkSubmit.scala:1220)
at org.apache.spark.deploy.DependencyUtils$.resolveMavenDependencies(DependencyUtils.scala:49)
at org.apache.spark.deploy.SparkSubmit$.prepareSubmitEnvironment(SparkSubmit.scala:350)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:170)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:136)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Traceback (most recent call last):
File "C:/...../proj1/pySparksqLite.py", line 4, in <module>
config('spark.jars.packages', "C:/Users/jars/sqlite-jdbc-3.20.0.jar").getOrCreate()
File "C:\spark\spark-2.3.0-bin-hadoop2.7\python\pyspark\sql\session.py", line 173, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "C:\spark\spark-2.3.0-bin-hadoop2.7\python\pyspark\context.py", line 331, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "C:\spark\spark-2.3.0-bin-hadoop2.7\python\pyspark\context.py", line 115, in __init__
SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
File "C:\spark\spark-2.3.0-bin-hadoop2.7\python\pyspark\context.py", line 280, in _ensure_initialized
SparkContext._gateway = gateway or launch_gateway(conf)
File "C:\spark\spark-2.3.0-bin-hadoop2.7\python\pyspark\java_gateway.py", line 95, in launch_gateway
raise Exception("Java gateway process exited before sending the driver its port number")
Exception: Java gateway process exited before sending the driver its port number
Process finished with exit code 1
I have also tried providing the jar file path through an environment variable, set via os:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars C:/Users/jars/sqlite-jdbc-3.27.2.jar'
but even this is not working.
The equivalent of the --jars submit parameter is spark.jars, which lets you specify local jars to transfer to the cluster. You used spark.jars.packages, which downloads packages from Maven by their Maven coordinates; the submit-parameter equivalent of that is --packages.
Have a look at the documentation for more information: configuration and submitting parameters
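A minimal sketch of the corrected builder, using spark.jars for a local jar (the path is the one from the question and is only illustrative):
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pySparksqLite_test")\
    .config("spark.jars", "C:/jars/DataVisualization/sqlite-jdbc-3.20.0.jar")\
    .getOrCreate()
If you go the PYSPARK_SUBMIT_ARGS route instead, the value typically has to end with pyspark-shell (e.g. '--jars C:/Users/jars/sqlite-jdbc-3.27.2.jar pyspark-shell') for the gateway to launch, which would also explain the "Java gateway process exited" error above.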
I am trying to execute a Python script from a Scala Spark job (Spark 2.3), like below:
val pyScript = "wasb://scripts#myAccount.blob.core.windows.net/print.py"
val pyScriptName = "print.py"
spark.sparkContext.addFile(pyScript)
val ipData = sc.parallelize(List("abc","def","ghi"))
val opData = ipData.pipe(org.apache.spark.SparkFiles.get(pyScriptName))
opData.foreach(println)
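For reference, pipe() launches the script as an external process on each executor, writes the partition's elements to its stdin one per line, and returns its stdout lines as the new RDD. The print.py itself is not shown in the question, so the following is an assumed stand-in; it would need to be executable with a valid shebang on the executors:
#!/usr/bin/env python
# print.py: echo each element received on stdin back to stdout
import sys

for line in sys.stdin:
    sys.stdout.write(line)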
However, I get the below exception. Any ideas what could be wrong?
org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 2.0 failed 4 times, most recent failure: Lost task 2.3 in stage 2.0 (TID 141, wn4-novakv.oetevw42cdoe3jls1dzdeclktg.ex.internal.cloudapp.net, executor 2): java.io.IOException: Cannot run program "/mnt/resource/hadoop/yarn/local/usercache/livy/appcache/application_1539905356890_0003/spark-5da59a08-a6a4-443d-b3b1-c31643e195c5/userFiles-e3ca8a1b-44f6-4804-9c95-625b1742fb77/print.py": error=2, No such file or directory
at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
at org.apache.spark.rdd.PipedRDD.compute(PipedRDD.scala:111)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: error=2, No such file or directory
I am trying to install the MongoDB Spark connector. Everything goes well; however, when I run the Spark code, the following issue comes up. Please help.
MongoDB Connector - 2.2.2
Spark - 2.2.0
MongoDB version - 3.6
18/05/08 11:18:39 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
Traceback (most recent call last):
File "/home/cisco/spark-mongo-test.py", line 7, in <module>
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
File "/home/cisco/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 165, in load
File "/home/cisco/spark-2.2.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/home/cisco/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/home/cisco/spark-2.2.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o39.load.
: java.lang.ClassNotFoundException: Failed to find data source: com.mongodb.spark.sql.DefaultSource. Please find packages at http://spark.apache.org/third-party-projects.html
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:549)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:301)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.mongodb.spark.sql.DefaultSource.DefaultSource
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21$$anonfun$apply$12.apply(DataSource.scala:533)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21$$anonfun$apply$12.apply(DataSource.scala:533)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21.apply(DataSource.scala:533)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$21.apply(DataSource.scala:533)
at scala.util.Try.orElse(Try.scala:84)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:53
I think you should add this line to [spark-home]/conf/spark-defaults.conf:
spark.jars.packages org.mongodb.spark:mongo-spark-connector_2.11:2.3.0
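Equivalently, the coordinates can be passed on the command line or in the session builder; a minimal, hedged sketch follows (the connector/Scala versions must match your Spark build, and the localhost URI is only an assumed example):
# spark-submit --packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.2 spark-mongo-test.py
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("mongo-test")\
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.2.2")\
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.myCollection")\
    .getOrCreate()

df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()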
For me the issue was that the jar containing mongo-spark-connector_2.11 was not in my [spark-home]/jars directory. You can download it here:
https://jar-download.com/?search_box=mongo%20spark%20connector
I am trying to use IcedTea / javaws to authenticate as a user at the USPTO (patent/trademark office). I download the JNLP file and try to use it with javaws, but I hit a "file name too long" error. Does anyone know how to work around this?
jeremy#jeremy-Blade ~/Downloads $ javaws uspto-auth.authenticate.jnlp
java.io.FileNotFoundException: /home/jeremy/.cache/icedtea-web/cache/0/https/efs.uspto.gov/TruePassWebStart/uspto-auth.authenticate.jnlp.q_SlNFU1NJT05JRD1GN0Q5ODBCNzhCODI3QTQwMkZEOEI0ODU4M0M5OTYzMi5wcm9kX3RwdG9tY2F0MjE1X2p2bTsgbXl1c3B0b19zaWduaW49aW5pdGlhdGVkOyBmc3Iucj0lN0IlMjJkJTIyJTNBOTAlMkMlMjJpJTIyJTNBJTIyZDU1NDUwNS01MTYyNjUxMy04NTMwLTJkNmItYTJmMmUlMjIlMkMlMjJlJTIyJTNBMTUxNTk5ODY2NDIxNCU3RDsgX191dG1hPTIxNzEyMjUwNS4yMDQ3MDM2NjA0LjE1MTUzOTM2MjUuMTUxNTU3NjYzOS4xNTE2ODAwMTU2LjI7IF9fdXRtej0yMTcxMjI1MDUuMTUxNTU3NjYzOS4xLjEudXRtY3NyPShkaXJlY3QpfHV0bWNjbj0oZGlyZWN0KXx1dG1jbWQ9KG5vbmUpOyBmc3Iucz0lN0IlMjJ2MSUyMiUzQS0xJTJDJTIydjIlMjIlM0EtMiUyQyUyMnJpZCUyMiUzQSUyMmQwNDgwMTItNTcwODgwODUtOTVhYy1lMmE4LTE0Njg3JTIyJTJDJTIycnUlMjIlM0ElMjJodHRwcyUzQSUyRiUyRnd3dy51c3B0by5nb3YlMkYlMjIlMkMlMjJyJTIyJTNBJTIyd3d3LnVzcHRvLmdvdiUyMiUyQyUyMnN0JTIyJTNBJTIyJTIyJTJDJTIyY3AlMjIlM0ElN0IlMjJQYXRlbnRzJTIyJTNBJTIyWSUyMiUyQyUyMlRyYWRlbWFya3MlMjIlM0ElMjJOJTIyJTJDJTIySVBfTGF3X1BvbGljeV9JbmZvJTIyJTNBJTIyTiUyMiUyQyUyMlZlbmRvcl9JbmZvJTIyJTNBJTIyTiUyMiUyQyUyMkF2YWlsX0VsZWN0cm9uaWNfQml6X1N5cyUyMiUzQSUyMk4lMjIlMkMlMjJJbnRsX0FjdGl2aXRpZXMlMjIlM0ElMjJOJTIyJTJDJTIyVGVzdGltb255X1NwZWVjaGVzJTIyJTNBJTIyTiUyMiU3RCUyQyUyMnRvJTIyJTNBMy4yJTJDJTIyYyUyMiUzQSUyMmh0dHBzJTNBJTJGJTJGd3d3LnVzcHRvLmdvdiUyRnBhdGVudHMtYXBwbGljYXRpb24tcHJvY2VzcyUyRmFwcGx5aW5nLW9ubGluZSUyRmFib3V0LWVmcy13ZWIlMjIlMkMlMjJwdiUyMiUzQTIlMkMlMjJsYyUyMiUzQSU3QiUyMmQxJTIyJTNBJTdCJTIydiUyMiUzQTIlMkMlMjJzJTIyJTNBdHJ1ZSU3RCU3RCUyQyUyMmNkJTIyJTNBMSUyQyUyMmYlMjIlM0ExNTE3NTg2MTE1MzQ2JTJDJTIyc2QlMjIlM0ExJTdEOyBfZ2E9R0ExLjIuMjA0NzAzNjYwNC4xNTE1MzkzNjI1OyBfZ2lkPUdBMS4yLjE2MDM3NTE1MzkuMTUxNzU4NjExODsgRW50cnVzdFRydWVQYXNzUmVkaXJlY3RVcmw9Imh0dHBzOi8vZWZzLnVzcHRvLmdvdi9FRlNXZWJVSVJlZ2lzdGVyZWQvRUZTV2ViUmVnaXN0ZXJlZCI_.info (File name too long)
at java.io.FileOutputStream.open0(Native Method)
at java.io.FileOutputStream.open(FileOutputStream.java:270)
at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
at java.io.FileOutputStream.<init>(FileOutputStream.java:162)
at net.sourceforge.jnlp.util.PropertiesFile.store(PropertiesFile.java:167)
at net.sourceforge.jnlp.cache.CacheEntry.store(CacheEntry.java:225)
at net.sourceforge.jnlp.cache.ResourceDownloader.initializeFromURL(ResourceDownloader.java:193)
at net.sourceforge.jnlp.cache.ResourceDownloader.initializeOnlineResource(ResourceDownloader.java:128)
at net.sourceforge.jnlp.cache.ResourceDownloader.initializeResource(ResourceDownloader.java:118)
at net.sourceforge.jnlp.cache.ResourceDownloader.run(ResourceDownloader.java:107)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)