I am running pyspark from an Azure Machine Learning notebook. I am trying to move a file using the dbutils module.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

def get_dbutils(spark):
    try:
        from pyspark.dbutils import DBUtils
        dbutils = DBUtils(spark)
    except ImportError:
        import IPython
        dbutils = IPython.get_ipython().user_ns["dbutils"]
    return dbutils

dbutils = get_dbutils(spark)
dbutils.fs.cp("file:source", "dbfs:destination")
I got this error:
ModuleNotFoundError: No module named 'pyspark.dbutils'
Is there a workaround for this?
Here is the error in another Azure Machine Learning notebook:
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-1-183f003402ff> in get_dbutils(spark)
      4     try:
----> 5         from pyspark.dbutils import DBUtils
      6         dbutils = DBUtils(spark)

ModuleNotFoundError: No module named 'pyspark.dbutils'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-1-183f003402ff> in <module>
     10     return dbutils
     11 
---> 12 dbutils = get_dbutils(spark)

<ipython-input-1-183f003402ff> in get_dbutils(spark)
      7     except ImportError:
      8         import IPython
----> 9         dbutils = IPython.get_ipython().user_ns["dbutils"]
     10     return dbutils
     11 

KeyError: 'dbutils'
Thanks a lot for any help.
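The pyspark.dbutils module only exists on Databricks runtimes; dbutils is injected by Databricks itself, so the import will always fail in an Azure Machine Learning notebook. One workaround (a minimal sketch, assuming the Databricks CLI is installed via pip install databricks-cli and configured with a token for your workspace, and using hypothetical example paths) is to shell out to the CLI for the copy:

import subprocess

# `databricks fs cp` copies between the local filesystem and DBFS.
# The source and destination paths below are hypothetical placeholders.
subprocess.run(
    ["databricks", "fs", "cp", "/tmp/source.txt", "dbfs:/tmp/destination.txt"],
    check=True,
)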
My goal is to save a trained model in XML (PMML) format, and I'm really struggling with these errors and warnings:
---------------------------------------------------------------------------
Exception in thread "Thread-4" java.lang.ExceptionInInitializerError
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Class.java:398)
at py4j.reflection.CurrentThreadClassLoadingStrategy.classForName(CurrentThreadClassLoadingStrategy.java:40)
at py4j.reflection.ReflectionUtil.classForName(ReflectionUtil.java:51)
at py4j.reflection.TypeUtil.forName(TypeUtil.java:243)
at py4j.commands.ReflectionCommand.getUnknownMember(ReflectionCommand.java:175)
at py4j.commands.ReflectionCommand.execute(ReflectionCommand.java:87)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.IllegalArgumentException: Expected Apache Spark ML version 3.1, got version 3.2 (3.2.0)
at org.jpmml.sparkml.ConverterFactory.checkVersion(ConverterFactory.java:114)
at org.jpmml.sparkml.PMMLBuilder.init(PMMLBuilder.java:481)
at org.jpmml.sparkml.PMMLBuilder.<clinit>(PMMLBuilder.java:545)
... 10 more
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/home/mbg/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/mbg/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "/home/mbg/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 503, in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
/tmp/ipykernel_20251/3496938591.py in <module>
----> 1 pmmlBuilder = PMMLBuilder(sc, df_train, rfModel)
~/.local/lib/python3.8/site-packages/pyspark2pmml/__init__.py in __init__(self, sc, df, pipelineModel)
     10         javaSchema = javaDf.schema.__call__()
     11         javaPipelineModel = pipelineModel._to_java()
---> 12         javaPmmlBuilderClass = sc._jvm.org.jpmml.sparkml.PMMLBuilder
     13         if(not isinstance(javaPmmlBuilderClass, JavaClass)):
     14             raise RuntimeError("JPMML-SparkML not found on classpath")

~/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in __getattr__(self, name)
   1647                 answer[proto.CLASS_FQN_START:], self._gateway_client)
   1648             else:
-> 1649                 raise Py4JError("{0} does not exist in the JVM".format(new_fqn))
   1650 
   1651 
Py4JError: org.jpmml.sparkml.PMMLBuilder does not exist in the JVM
My code is the following:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
conf = SparkConf().setAppName("SparkApp_ETL_ML").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.getOrCreate()
import pandas as pd
df=pd.read_parquet("https://s3.eu-de.cloud-object-storage.appdomain.cloud/cloud-object-storage-yy-cos-standard-js4/data.parquet")
sdf = spark.createDataFrame(df)
from pyspark.sql.types import DoubleType
sdf = sdf.withColumn("x", sdf.x.cast(DoubleType()))
sdf = sdf.withColumn("y", sdf.y.cast(DoubleType()))
sdf = sdf.withColumn("z", sdf.z.cast(DoubleType()))
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
input_columns = ["x", "y", "z"] # input columns to consider
train, test = sdf.randomSplit([0.8, 0.2], seed=1)
indexer = StringIndexer(inputCol="class", outputCol="label")
vectorAssembler = VectorAssembler(inputCols=input_columns, outputCol="features")
normalizer = MinMaxScaler(inputCol="features", outputCol="features_norm")
pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer])
binEval = MulticlassClassificationEvaluator() \
    .setMetricName("accuracy") \
    .setPredictionCol("prediction") \
    .setLabelCol("label")
df_train = pipeline.fit(train).transform(train)
df_test = pipeline.fit(test).transform(test)
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol='features_norm', labelCol='label', maxDepth=20, numTrees=7, seed=1)
rfModel = rf.fit(df_train)
from pyspark2pmml import PMMLBuilder
model_target = "HMP_frModel.xml"
pmmlBuilder = PMMLBuilder(sc, df_train, rfModel)
Everything works well until the last line of the code.
I tried all the solutions I found on the internet, but unfortunately without success.
I am working with Jupyter Notebook (not Anaconda), installed pyspark with pip, and added these variables to my .bashrc:
export PATH=$PATH:~/.local/bin
export SPARK_HOME=~/.local/lib/python3.8/site-packages/pyspark
export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.10.9.2-src.zip
export PATH=$SPARK_HOME/bin:$SPARK_HOME/python:$PATH
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
I also downloaded the jar files jpmml-sparkml-executable-1.7.2.jar and jpmml-sparkml-executable-1.8.0.jar and put them in the directory ~/.local/lib/python3.8/site-packages/pyspark/jars.
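The checkVersion error above reports that the loaded converter expects Spark ML 3.1 while the running Spark is 3.2.0, which suggests jpmml-sparkml-executable-1.7.2.jar (built against Spark 3.1) is being picked up ahead of the 1.8.0 jar. A sketch of a likely fix, assuming 1.8.0 is the build matching Spark 3.2: remove the 1.7.2 jar so only one version is on the classpath, or point Spark at the right jar explicitly via spark.jars:

from pyspark import SparkConf, SparkContext

# Hypothetical path; keep only the jar whose JPMML-SparkML version matches the Spark release.
conf = (SparkConf()
        .setAppName("SparkApp_ETL_ML")
        .setMaster("local[*]")
        .set("spark.jars", "/home/mbg/jars/jpmml-sparkml-executable-1.8.0.jar"))
sc = SparkContext.getOrCreate(conf)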
I use Anaconda and Jupyter Notebook.
I installed QuantLib in an environment.
I run the following piece of code and get an AttributeError:
import QuantLib as ql
calculation_date = ql.date(9,1,2008)
ql.Settings.instance().evaluationDate = calculation_date
I get the following error:
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-7-3ded7a4b7cb5> in <module>()
----> 1 calculation_date = ql.date(9,1,2004)
      2 ql.Settings.instance().evaluationDate = calculation_date

AttributeError: module 'QuantLib' has no attribute 'date'
How can I fix this problem?
I think it requires caps? You should use Date instead of date:
import QuantLib as ql
calculation_date = ql.Date(9,1,2008)
ql.Settings.instance().evaluationDate = calculation_date
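A quick check that the fix took effect (a minimal sketch; note that Date takes day, month, year):

import QuantLib as ql

calculation_date = ql.Date(9, 1, 2008)  # day, month, year
ql.Settings.instance().evaluationDate = calculation_date
print(ql.Settings.instance().evaluationDate)  # should print the January 9th, 2008 date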
I am running the FPGrowth algorithm using pyspark in Python 3.6 in a Jupyter notebook. When I try to save the association rules, the output of generated rules is huge, so I want to limit the number of consequents. Here is the code I have tried. I also changed the Spark context parameters.
from pyspark.sql.functions import col, size
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import Row
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark import SparkConf
conf = SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]')
.set('spark.executor.memory', '100G')
.set('spark.driver.memory', '400G')
.set('spark.driver.maxResultSize', '200G'))
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)
R = Row('ID', 'items')
df=spark.createDataFrame([R(i, x) for i, x in enumerate(lol)])
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.7, minConfidence=0.9)
model = fpGrowth.fit(df)
ar=model.associationRules.where(size(col('antecedent')) == 2).where(size(col('consequent')) == 1)
ar.cache()
ar.toPandas().to_csv('output.csv')
It gives an error
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1-f90c7a9f11ae> in <module>
---> 73 ar=model.associationRules.where(size(col('antecedent')) == 2).where(size(col('consequent')) == 1)

TypeError: 'str' object is not callable
Can someone help me solve the issue?
Here lol is a list of lists of transactions: [['a','b'],['c','a','e']....]
Python: 3.6.5
Pyspark
Windows 10
From the above discussion and following this link, I was able to resolve the problem:
'str' object is not callable TypeError
import pyspark.sql.functions as func
model.associationRules.where(func.size(func.col('antecedent')) == 1).where(func.size(func.col('consequent')) == 1).show()
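The likely cause of the TypeError, sketched below with a hypothetical reassignment: a later cell rebound the bare name size (or col) to a plain string, and the namespaced func. access is immune to that kind of shadowing.

from pyspark.sql.functions import col, size

size = 'foo'  # hypothetical: any assignment that shadows the imported function
# size(col('antecedent'))            # -> TypeError: 'str' object is not callable

import pyspark.sql.functions as func
# func.size(func.col('antecedent'))  # still resolves to the real function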
I keep getting "NameError: name 're' is not defined", even though I have already imported re in my code AND in the built-in function pat_count() defined in library_s19_week2.py. I tried all the possible places to import re, but none seemed to work. Please help!
My code:
import re
hash_pat = re.compile(r'#\w+')
hash_counter = pat_count(hash_pat)
tweet_table['hash_count'] = tweet_table.apply(lambda row: hash_counter(row['tweet']), axis=1)
Traceback for the error:
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-93-1880eb903ae9> in <module>()
     10 
     11 hash_pat = re.compile(r'#\w+')
---> 12 hash_counter = pat_count(hash_pat)
     13 tweet_table['hash_count'] = tweet_table.apply(lambda row: hash_counter(row['tweet']), axis=1)
     14 

/content/library_s19_week2.py in pat_count(pattern)
     95 def pat_count(pattern):
     96     import re
---> 97 
     98     pat = re.compile(pattern)
     99 

NameError: name 're' is not defined
I found my bug:
hash_pat = re.compile(r'#\w+') should be hash_pat = r'#\w+'.
As seen in the function pat_count() in the traceback, hash_pat is passed straight to re.compile(), so it must be a pattern string, not an already-compiled pattern.
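A minimal sketch of the corrected call, using the names from the traceback:

hash_pat = r'#\w+'                   # raw pattern string, not a compiled pattern
hash_counter = pat_count(hash_pat)   # pat_count calls re.compile(pattern) itself
tweet_table['hash_count'] = tweet_table.apply(lambda row: hash_counter(row['tweet']), axis=1)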
Python 2.7.10 / Anaconda / Windows 8.1
I have a strange issue: the following code works in one solution file in the same working directory.
But when I copy the exact same code into my sheet, I get this error, and I have no idea how to fix it.
Here's the code:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
x = np.zeros(20)
x[:5] = 10
x[5:15] = np.arange(12,31,2)
x[15:] = 30
plt.plot(x)
plt.plot([4,4],[8,32],'k--')
plt.plot([14,14],[8,32],'k--')
plt.ylim(8,32)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-65-6b573104eb1d> in <module>()
      6 plt.plot([4,4],[8,32],'k--')
      7 plt.plot([14,14],[8,32],'k--')
----> 8 plt.ylim(8,32)

TypeError: 'int' object is not callable
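For context, 'int' object is not callable on plt.ylim means the name currently holds an integer instead of the function, which usually happens when it was rebound earlier in the session. A sketch of the failure mode and one way to recover, with a hypothetical offending assignment:

import importlib
import matplotlib.pyplot as plt

plt.ylim = 32          # hypothetical earlier typo: rebinds the function to an int
# plt.ylim(8, 32)      # -> TypeError: 'int' object is not callable

importlib.reload(plt)  # re-executes pyplot, restoring the real ylim
plt.ylim(8, 32)        # works again (restarting the kernel also fixes it)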