Pyspark Cosine similarity Invalid argument, not a string or column - pyspark

I am trying to calculate cosine distances of 2 title and headline columns via using pre-trained bert model just like below
title
headline
title_array
headline_array
arrayed
Dance Gavin Dance bass player Tim Feerick dead at 34
Prince Harry and Meghan Markle make secret visit to see Queen ahead of Invictus Games
["Dance Gavin Dance bass player Tim Feerick dead at 34"]
["Prince Harry and Meghan Markle make secret visit to see Queen ahead of Invictus Games"]
["Dance Gavin Dance bass player Tim Feerick dead at 34", "Prince Harry and Meghan Markle make secret visit to see Queen ahead of Invictus Games"]
# downloading bert
model = SentenceTransformer('bert-base-nli-mean-tokens')
from sentence_transformers import SentenceTransformer
import numpy as np
from pyspark.sql.types import FloatType
import pyspark.sql.functions as f
#udf(FloatType())
def cosine_similarity(sentence_embeddings, ind_a, ind_b):
s = sentence_embeddings
return np.dot(s[ind_a], s[ind_b]) / (np.linalg.norm(s[ind_a]) * np.linalg.norm(s[ind_b]))
#udf_bert = udf(cosine_similarity, FloatType())
''''
s0 = "our president is a good leader he will not fail"
s1 = "our president is not a good leader he will fail"
s2 = "our president is a good leader"
s3 = "our president will succeed"
sentences = [s0, s1, s2, s3]
sentence_embeddings = model.encode(sentences)
s = sentence_embeddings
print(f"{s0} <--> {s1}: {udf_bert(sentence_embeddings, 0, 1)}")
print(f"{s0} <--> {s2}: {cosine_similarity(sentence_embeddings, 0, 2)}")
print(f"{s0} <--> {s3}: {cosine_similarity(sentence_embeddings, 0, 3)}")
'''''
test_df = test_df.withColumn("Similarities", (cosine_similarity(model.encode(test_df.arrayed), 0, 1))
As we see from the example , algorithm takes concatenation of two array of strings and calculate distances of cosine among them.
When I only run the algorithm/function with the sample texts commented out , it is working. But when I try to apply it into my dataframe via registering as a udf and call with dataframe I am facing with the error below:
TypeError Traceback (most recent call last)
<command-757165186581086> in <module>
26 '''''
27
---> 28 test_df = test_df.withColumn("Similarities", f.lit(cosine_similarity(model.encode(test_df.arrayed), 0, 1)))
/databricks/spark/python/pyspark/sql/udf.py in wrapper(*args)
197 #functools.wraps(self.func, assigned=assignments)
198 def wrapper(*args):
--> 199 return self(*args)
200
201 wrapper.__name__ = self._name
/databricks/spark/python/pyspark/sql/udf.py in __call__(self, *cols)
177 judf = self._judf
178 sc = SparkContext._active_spark_context
--> 179 return Column(judf.apply(_to_seq(sc, cols, _to_java_column)))
180
181 # This function is for improving the online help system in the interactive interpreter.
/databricks/spark/python/pyspark/sql/column.py in _to_seq(sc, cols, converter)
60 """
61 if converter:
---> 62 cols = [converter(c) for c in cols]
63 return sc._jvm.PythonUtils.toSeq(cols)
64
/databricks/spark/python/pyspark/sql/column.py in <listcomp>(.0)
60 """
61 if converter:
---> 62 cols = [converter(c) for c in cols]
63 return sc._jvm.PythonUtils.toSeq(cols)
64
/databricks/spark/python/pyspark/sql/column.py in _to_java_column(col)
44 jcol = _create_column_from_name(col)
45 else:
---> 46 raise TypeError(
47 "Invalid argument, not a string or column: "
48 "{0} of type {1}. "
TypeError: Invalid argument, not a string or column: [-0.29246375 0.02216947 0.610355 -0.02230968 0.61386955 0.15291359]

The input of a UDF is a Column or a column name, that's why Spark is complaining Invalid argument, not a string or column: [-0.29246375 0.02216947 0.610355 -0.02230968 0.61386955 0.15291359]. You'll need to pass arrayed only, and refer model inside your UDF. Something like this
#udf(FloatType())
def cosine_similarity(sentence_embeddings, ind_a, ind_b):
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
s = model.encode(arrayed)
return np.dot(s[ind_a], s[ind_b]) / (np.linalg.norm(s[ind_a]) * np.linalg.norm(s[ind_b]))
test_df = test_df.withColumn("Similarities", (cosine_similarity(test_df.arrayed, 0, 1))

Related

Is there a way to use spark MLLib CrossValidator without parameter grid?

I want to use cross-validation instead of the normal validation set approach just as a means to get a better estimate of the test error rate. I am using spark-MLLib Dataframe based API. However if I run the following code -
cv = tuning.CrossValidator(estimator=randomForestRegressor, evaluator=evaluator, numFolds=5)
cv_model = cv.fit(vsdf)
I get the error -
KeyError Traceback (most recent call last)
<ipython-input-44-d4e7a9d3602e> in <module>
----> 1 cv_model = cv.fit(vsdf)
C:\Spark\spark-3.1.2-bin-hadoop3.2\python\pyspark\ml\base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
C:\Spark\spark-3.1.2-bin-hadoop3.2\python\pyspark\ml\tuning.py in _fit(self, dataset)
667 def _fit(self, dataset):
668 est = self.getOrDefault(self.estimator)
--> 669 epm = self.getOrDefault(self.estimatorParamMaps)
670 numModels = len(epm)
671 eva = self.getOrDefault(self.evaluator)
C:\Spark\spark-3.1.2-bin-hadoop3.2\python\pyspark\ml\param\__init__.py in getOrDefault(self, param)
344 return self._paramMap[param]
345 else:
--> 346 return self._defaultParamMap[param]
347
348 def extractParamMap(self, extra=None):
KeyError: Param(parent='CrossValidator_a9121a59fda3', name='estimatorParamMaps', doc='estimator param maps')
I guess this is because I have not provided any parameter-map to search over. Is there no way to do cross-validation in spark-ml without a parameter grid?
Yes, you can do it. You need to pass an empty parameter grid though.
something like this should work, it will behave like a normal K-fold cross validator.
params_grid = ParamGridBuilder().build()

Python Jupyter Notebook scipy

For a long time I was able to add data and fit, then plot the curve with data. But recently I get this:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-6-6f645a2744bc> in <module>
1 poland = prepare_data(europe_data, 'Poland')
----> 2 plot_all(poland, max_y=400000)
3 poland
~/Pulpit/library.py in plot_all(country, max_x, max_y)
43 def plot_all(country, max_x = 1000, max_y = 500000):
44
---> 45 parameters_logistic = scipy.optimize.curve_fit(func_logistic, country['n'], country['all'])[0]
46 parameters_expo = scipy.optimize.curve_fit(func_expo, country['n'], country['all'])[0]
47
/usr/local/lib64/python3.6/site-packages/scipy/optimize/minpack.py in curve_fit(f, xdata, ydata, p0, sigma, absolute_sigma, check_finite, bounds, method, jac, **kwargs)
787 cost = np.sum(infodict['fvec'] ** 2)
788 if ier not in [1, 2, 3, 4]:
--> 789 raise RuntimeError("Optimal parameters not found: " + errmsg)
790 else:
791 # Rename maxfev (leastsq) to max_nfev (least_squares), if specified.
RuntimeError: Optimal parameters not found: Number of calls to function has reached maxfev = 800.
Here are all Python Jupyter Notebook files: https://files.fm/u/zj7cc6ne#sign_up
How to solve this?
scipy.optimize.curve_fit takes a keyword argument p0.
Initial guess for the parameters (length N). If None, then the initial
values will all be 1 (if the number of parameters for the function can
be determined using introspection, otherwise a ValueError is raised).
If the defaults 1 are too far of from the result the algorithm may not converge. Try to put some values that make sense for your problem.

Using pretrained models from sparknlp on Databricks

I am trying to follow the official examples from John Snow Labs but every time I get a TypeError: 'JavaPackage' object is not callable error. I followed all of the steps in the Databricks install documentation but no matter what walkthrough I try, either this one or this one it fails.
An example of the first (after doing the installs):
import sparknlp
from sparknlp.pretrained import *
pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
recognize_entities_dl download started this may take some time.
TypeError: 'JavaPackage' object is not callable
TypeError Traceback (most recent call last)
<command-937510457011238> in <module>
----> 1 pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
2
3 # ner_bert = NerDLModel.pretrained('ner_dl_bert')
4
5 # pipeline = PretrainedPipeline('recognize_entities_dl', 'en', 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip')
/databricks/python/lib/python3.7/site-packages/sparknlp/pretrained.py in __init__(self, name, lang, remote_loc, parse_embeddings, disk_location)
89 def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
90 if not disk_location:
---> 91 self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
92 else:
93 self.model = PipelineModel.load(disk_location)
/databricks/python/lib/python3.7/site-packages/sparknlp/pretrained.py in downloadPipeline(name, language, remote_loc)
49 def downloadPipeline(name, language, remote_loc=None):
50 print(name + " download started this may take some time.")
---> 51 file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
52 if file_size == "-1":
53 print("Can not find the model to download please check the name!")
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, name, language, remote_loc)
190 def __init__(self, name, language, remote_loc):
191 super(_GetResourceSize, self).__init__(
--> 192 "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize", name, language, remote_loc)
193
194
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, java_obj, *args)
127 super(ExtendedJavaWrapper, self).__init__(java_obj)
128 self.sc = SparkContext._active_spark_context
--> 129 self._java_obj = self.new_java_obj(java_obj, *args)
130 self.java_obj = self._java_obj
131
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in new_java_obj(self, java_class, *args)
137
138 def new_java_obj(self, java_class, *args):
--> 139 return self._new_java_obj(java_class, *args)
140
141 def new_java_array(self, pylist, java_class):
/databricks/spark/python/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
65 java_obj = getattr(java_obj, name)
66 java_args = [_py2java(sc, arg) for arg in args]
---> 67 return java_obj(*java_args)
68
69 #staticmethod
TypeError: 'JavaPackage' object is not callable
I get a similar if not the exact error if I try:
pipeline = PretrainedPipeline('recognize_entities_dl', 'en', 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip')
I also get the same error for the second example. The Databricks Runtime Version is: 6.5 (includes Apache Spark 2.4.5, Scala 2.11), which is on the list of approved runtimes.
I'm not sure what the error messages mean or how to resolve them.
I found out that 'JavaPackage' object is not callable is caused by the spark-nlp (assembly jars) missing. So I made sure that these jars were downloaded and then placed in BOTH the executor and driver. E.g
when building the Spark docker image do something like
RUN cd /opt/spark/jars && \
wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-nlp-assembly-2.6.4.jar
and also on the driver image/machine make sure the jar exists in the local directoy. Then set
conf.set("spark.driver.extraClassPath", "/opt/spark/jars/spark-nlp-assembly-2.6.4.jar")
conf.set("spark.executor.extraClassPath", "/opt/spark/jars/spark-nlp-assembly-2.6.4.jar")
The solution for databricks might be a bit different so instead of baking in the jars you may need to host them on S3 and refer to them that way.

Autoencoder in fastai

I'm trying to build an autoencoder with fast.ai version 1.0.52 and struggling with how to set labels to be equal to original images. I was
following this blog post: https://alanbertl.com/autoencoder-with-fast-ai/
I replaced ImageItemList in the original code with ImageList since it was changed in the latest fastai versions.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import *
from fastai.vision import *
from fastai.data_block import *
from fastai.basic_train import *
import pandas as pd
x = np.random.randint(256, size=(1000, 16384))
x = x/255
x = x.reshape(-1,128,128)
x = np.stack([x,x,x],1)
x.shape
class ArraysImageList(ImageList,FloatList):
def __init__(self, items:Iterator, log:bool=False, **kwargs):
if isinstance(items, ItemList):
items = items.items
super(FloatList,self).__init__(items,**kwargs)
def get(self,i):
return Tensor(super(FloatList,self).get(i).astype('float32'))
x_il = ArraysImageList(x)
x_ils = x_il.split_by_rand_pct()
lls = x_ils.label_from_lists(x_ils.train, x_ils.valid)
Here's the error message I get.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-33-cbada9e18af9> in <module>
----> 1 lls = x_ils.label_from_lists(x_ils.train, x_ils.valid)
~/.local/lib/python3.6/site-packages/fastai/data_block.py in label_from_lists(self, train_labels, valid_labels, label_cls, **kwargs)
484 self.valid = self.valid._label_list(x=self.valid, y=self.train.y.new(valid_labels, **kwargs))
485 self.__class__ = LabelLists
--> 486 self.process()
487 return self
488
~/.local/lib/python3.6/site-packages/fastai/data_block.py in process(self)
520 "Process the inner datasets."
521 xp,yp = self.get_processors()
--> 522 for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
523 #progress_bar clear the outputs so in some case warnings issued during processing disappear.
524 for ds in self.lists:
~/.local/lib/python3.6/site-packages/fastai/data_block.py in process(self, xp, yp, name)
683 def process(self, xp:PreProcessor=None, yp:PreProcessor=None, name:str=None):
684 "Launch the processing on `self.x` and `self.y` with `xp` and `yp`."
--> 685 self.y.process(yp)
686 if getattr(self.y, 'filter_missing_y', False):
687 filt = array([o is None for o in self.y.items])
~/.local/lib/python3.6/site-packages/fastai/data_block.py in process(self, processor)
73 if processor is not None: self.processor = processor
74 self.processor = listify(self.processor)
---> 75 for p in self.processor: p.process(self)
76 return self
77
~/.local/lib/python3.6/site-packages/fastai/data_block.py in process(self, ds)
334
335 def process(self, ds):
--> 336 if self.classes is None: self.create_classes(self.generate_classes(ds.items))
337 ds.classes = self.classes
338 ds.c2i = self.c2i
~/.local/lib/python3.6/site-packages/fastai/data_block.py in generate_classes(self, items)
391 for c in items: classes = classes.union(set(c))
392 classes = list(classes)
--> 393 classes.sort()
394 return classes
395
RuntimeError: bool value of Tensor with more than one value is ambiguous
Ultimately, I want to read images using a dataframe with image paths. So I also tried the following:
import sklearn
cv = sklearn.model_selection.GroupKFold(n_splits=5)
train_inds, valid_inds = next(cv.split(iso_image_df.group, groups=iso_image_df.group))
img_lists = (ImageList.from_df(iso_image_df, resized_img_path, cols=0).split_by_idxs(train_inds, valid_inds))
src = img_lists.label_from_lists(img_lists.train, img_lists.valid)
data = (src.databunch(bs = 32).normalize(imagenet_stats))
data.show_batch(rows=3, figsize=(10, 10))
Here I get the following error message:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-146-2514de511e64> in <module>
----> 1 data.show_batch(rows=3, figsize=(10, 10))
~/.local/lib/python3.6/site-packages/fastai/basic_data.py in show_batch(self, rows, ds_type, reverse, **kwargs)
190 #TODO: get rid of has_arg if possible
191 if has_arg(self.train_ds.y.reconstruct, 'x'):
--> 192 ys = [self.train_ds.y.reconstruct(grab_idx(y, i), x=x) for i,x in enumerate(xs)]
193 else : ys = [self.train_ds.y.reconstruct(grab_idx(y, i)) for i in range(n_items)]
194 self.train_ds.x.show_xys(xs, ys, **kwargs)
~/.local/lib/python3.6/site-packages/fastai/basic_data.py in <listcomp>(.0)
190 #TODO: get rid of has_arg if possible
191 if has_arg(self.train_ds.y.reconstruct, 'x'):
--> 192 ys = [self.train_ds.y.reconstruct(grab_idx(y, i), x=x) for i,x in enumerate(xs)]
193 else : ys = [self.train_ds.y.reconstruct(grab_idx(y, i)) for i in range(n_items)]
194 self.train_ds.x.show_xys(xs, ys, **kwargs)
~/.local/lib/python3.6/site-packages/fastai/data_block.py in reconstruct(self, t, x)
89 def reconstruct(self, t:Tensor, x:Tensor=None):
90 "Reconstruct one of the underlying item for its data `t`."
---> 91 return self[0].reconstruct(t,x) if has_arg(self[0].reconstruct, 'x') else self[0].reconstruct(t)
92
93 def new(self, items:Iterator, processor:PreProcessors=None, **kwargs)->'ItemList':
AttributeError: 'Image' object has no attribute 'reconstruct'
Any help is highly appreciated!
The lls are being used to create the databunch.
I've looked at it and given the API change in fastai libs I created the databunch without using the lls that were causing the error:
bs = 64
db = (ImageImageList.from_folder(mnist)
.split_by_folder()
.label_from_func(get_y_fn)
.databunch(bs=bs,num_workers=4))
EDIT: you'll need the get_y_fn; it is very simply defined
def get_y_fn(x): return x
the lls aren't used for anything else anyway
This should fix your problem, let me know if this worked for you.

AttributeError: 'NoneType' object has no attribute 'setCallSite' pyspark after indexedRowMatrix columnSimilarities()

I'm working on a code that was correctly executed with the dataframe before, but this time when I execute it, I get an error. (The only difference is that I used persist() on the dataframe this time.)
simMat = IndexedRMat.columnSimilarities()
executes correctly, but then this part:
columns = ['product1', 'product2', 'sim']
vals = simMat.entries.map(lambda e: (e.i, e.j, e.value)).collect()
dfsim = spark.createDataFrame(vals, columns)
generates this error:
AttributeErrorTraceback (most recent call last)
<ipython-input-100-11502084c71b> in <module>()
1 columns = ['product1', 'product2', 'sim']
----> 2 vals = simMat.entries.map(lambda e: (e.i, e.j, e.value)).collect()
3 dfsim = spark.createDataFrame(vals, columns)
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/pyspark/rdd.pyc in collect(self)
806 to be small, as all the data is loaded into the driver's memory.
807 """
--> 808 with SCCallSiteSync(self.context) as css:
809 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
810 return list(_load_from_socket(port, self._jrdd_deserializer))
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/pyspark/traceback_utils.pyc in __enter__(self)
70 def __enter__(self):
71 if SCCallSiteSync._spark_stack_depth == 0:
---> 72 self._context._jsc.setCallSite(self._call_site)
73 SCCallSiteSync._spark_stack_depth += 1
74
AttributeError: 'NoneType' object has no attribute 'setCallSite'
What does it mean? I'm new to spark and didn't find an explanation for this type of error..