How can I use sqlContext in a Spark UDF - pyspark

I'm trying to load a JSON file in a Spark UDF and use it to query something. What I need to do is take a column value (storeId) from a data frame and use it in that query.
But I get a pickle error. If I code it without the sqlContext, it works.
Is there any workaround, or is this not possible?
def get_id_udf(storeId, sqlContext):
    df = sqlContext.read.json("file_url_s3")
    if storeId == None:
        return None
    return None
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType
desc_udf = udf(lambda storeId : get_id_udf(storeId,sqlContext), IntegerType())
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-22-b5c4070c110e> in <module>()
1 from pyspark.sql.functions import udf, col
2 from pyspark.sql.types import IntegerType
----> 3 desc_udf = udf(lambda storeId : get_cluster_id_udf(storeId,sqlContext), IntegerType())
/usr/lib/spark/python/pyspark/sql/functions.py in udf(f, returnType)
1799 [Row(slen=5), Row(slen=3)]
1800 """
-> 1801 return UserDefinedFunction(f, returnType)
1802
1803 blacklist = ['map', 'since', 'ignore_unicode_prefix']
/usr/lib/spark/python/pyspark/sql/functions.py in __init__(self, func, returnType, name)
1758 self.returnType = returnType
1759 self._broadcast = None
-> 1760 self._judf = self._create_judf(name)
1761
1762 def _create_judf(self, name):
/usr/lib/spark/python/pyspark/sql/functions.py in _create_judf(self, name)
1763 from pyspark.sql import SQLContext
1764 sc = SparkContext.getOrCreate()
-> 1765 wrapped_func = _wrap_function(sc, self.func, self.returnType)
1766 ctx = SQLContext.getOrCreate(sc)
1767 jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())
/usr/lib/spark/python/pyspark/sql/functions.py in _wrap_function(sc, func, returnType)
1743 def _wrap_function(sc, func, returnType):
1744 command = (func, returnType)
-> 1745 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
1746 return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes,
sc.pythonExec,
1747 sc.pythonVer, broadcast_vars, sc._javaAccumulator)
/usr/lib/spark/python/pyspark/rdd.py in _prepare_for_python_RDD(sc, command)
2313 # the serialized command will be compressed by broadcast
2314 ser = CloudPickleSerializer()
-> 2315 pickled_command = ser.dumps(command)
2316 if len(pickled_command) > (1 << 20): # 1M
2317 # The broadcast will have same life cycle as created PythonRDD
/usr/lib/spark/python/pyspark/serializers.py in dumps(self, obj)
426
427 def dumps(self, obj):
--> 428 return cloudpickle.dumps(obj, 2)
429
430
/usr/lib/spark/python/pyspark/cloudpickle.py in dumps(obj, protocol)
655
656 cp = CloudPickler(file,protocol)
--> 657 cp.dump(obj)
658
659 return file.getvalue()
/usr/lib/spark/python/pyspark/cloudpickle.py in dump(self, obj)
105 self.inject_addons()
106 try:
--> 107 return Pickler.dump(self, obj)
108 except RuntimeError as e:
109 if 'recursion' in e.args[0]:
/usr/lib64/python2.7/pickle.pyc in dump(self, obj)
222 if self.proto >= 2:
223 self.write(PROTO + chr(self.proto))
--> 224 self.save(obj)
225 self.write(STOP)
226
/usr/lib64/python2.7/pickle.pyc in save(self, obj)
284 f = self.dispatch.get(t)
285 if f:
--> 286 f(self, obj) # Call unbound method with explicit self
287 return
288
/usr/lib64/python2.7/pickle.pyc in save_dict(self, obj)
653
654 self.memoize(obj)
--> 655 self._batch_setitems(obj.iteritems())
656
657 dispatch[DictionaryType] = save_dict
/usr/lib64/python2.7/pickle.pyc in _batch_setitems(self, items)
685 for k, v in tmp:
686 save(k)
--> 687 save(v)
688 write(SETITEMS)
689 elif n:
/usr/lib64/python2.7/pickle.pyc in save(self, obj)
304 reduce = getattr(obj, "__reduce_ex__", None)
305 if reduce:
--> 306 rv = reduce(self.proto)
307 else:
308 reduce = getattr(obj, "__reduce__", None)
TypeError: 'JavaPackage' object is not callable
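A SQLContext (or SparkSession) lives on the driver and cannot be pickled and shipped to executors, which is why wrapping it in a UDF fails. A common workaround, sketched below under the assumption that the lookup JSON is small enough to collect (the column names storeId and id are placeholders), is to read the file once on the driver, broadcast a plain Python dict, and look values up inside the UDF:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# Read the lookup JSON once on the driver (path and column names are placeholders).
lookup_df = sqlContext.read.json("file_url_s3")
lookup = {row["storeId"]: row["id"] for row in lookup_df.collect()}

# Broadcast the plain dict so executors receive the data, not the SQLContext
# (sc is the existing SparkContext).
lookup_bc = sc.broadcast(lookup)

def get_id(storeId):
    if storeId is None:
        return None
    return lookup_bc.value.get(storeId)

desc_udf = udf(get_id, IntegerType())
df = df.withColumn("id", desc_udf(col("storeId")))
If the JSON is too large to collect, the usual alternative is to read it into a second DataFrame and join on storeId rather than using a UDF at all.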

Related

How do I find the count of missing values in a pyspark data frame

I'm using pyspark 3.2.1. I'm trying to find the missing value count in each column of my pyspark data frame.
So I used the following code:
dataColumns=['columns in my data frame']
df.select([count(when(isnan(c), c)).alias(c) for c in dataColumns]).show(truncate=False)
But I got this error message:
---------------------------------------------------------------------------
AnalysisException Traceback (most recent call last)
<ipython-input-56-6c7766e33c77> in <module>()
1 dataColumns=['myDate']
----> 2 df.select([count(when(isnan(c), c)).alias(c) for c in dataColumns]).show(truncate=False)
/usr/local/spark/python/pyspark/sql/dataframe.py in select(self, *cols)
1667 [Row(name='Alice', age=12), Row(name='Bob', age=15)]
1668 """
-> 1669 jdf = self._jdf.select(self._jcols(*cols))
1670 return DataFrame(jdf, self.sql_ctx)
1671
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 # Hide where the exception came from that shows a non-Pythonic
116 # JVM exception message.
--> 117 raise converted from None
118 else:
119 raise
AnalysisException: cannot resolve 'isnan(`myDate`)' due to data type mismatch: argument 1 requires (double or float) type, however, '`myDate`' is of timestamp type.;
'Aggregate [count(CASE WHEN isnan(myDate#1994) THEN myDate END) AS myDate#5831]
Can you please help me to resolve this issue?
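The error message points at the cause: isnan() is only defined for float/double columns, and myDate is a timestamp. A sketch of a fix, assuming that for non-numeric columns a missing value simply means null, is to branch on the column type and use isNull() where isnan() does not apply:
from pyspark.sql.functions import count, when, isnan, col

# isnan() applies only to float/double columns; use isNull() for everything else.
exprs = []
for c, t in df.dtypes:
    if t in ("double", "float"):
        exprs.append(count(when(isnan(c) | col(c).isNull(), c)).alias(c))
    else:
        exprs.append(count(when(col(c).isNull(), c)).alias(c))

df.select(exprs).show(truncate=False)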

Error when loading pipelines in spaCy 3.0

After updating to spaCy 3.0.6 I haven't been able to load either of the trained pipelines, although both seem to be properly installed:
================= Installed pipeline packages (spaCy v3.0.6) =================
ℹ spaCy installation:
/Users/baconbaker/anaconda3/envs/ml/lib/python3.8/site-packages/spacy
NAME SPACY VERSION
en_core_web_sm >=3.0.0,<3.1.0 3.0.0 ✔
en_core_web_trf >=3.0.0,<3.1.0 3.0.0 ✔
This occurs when using spacy.load() and when importing the pipelines as a module (the error is identical for all of the following lines):
nlp = spacy.load("en_core_web_trf")
nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()
import en_core_web_trf
nlp = en_core_web_trf.load()
The error I'm getting is the following:
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-9-b38eb3aae320> in <module>
1 import en_core_web_trf
----> 2 nlp = en_core_web_trf.load()
~/anaconda3/envs/ml/lib/python3.8/site-packages/en_core_web_trf/__init__.py in load(**overrides)
8
9 def load(**overrides):
---> 10 return load_model_from_init_py(__file__, **overrides)
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/util.py in load_model_from_init_py(init_file, vocab, disable, exclude, config)
514 if not model_path.exists():
515 raise IOError(Errors.E052.format(path=data_path))
--> 516 return load_model_from_path(
517 data_path,
518 vocab=vocab,
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/util.py in load_model_from_path(model_path, meta, vocab, disable, exclude, config)
389 config_path = model_path / "config.cfg"
390 config = load_config(config_path, overrides=dict_to_dot(config))
--> 391 nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude)
392 return nlp.from_disk(model_path, exclude=exclude)
393
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/util.py in load_model_from_config(config, vocab, disable, exclude, auto_fill, validate)
426 # registry, including custom subclasses provided via entry points
427 lang_cls = get_lang_class(nlp_config["lang"])
--> 428 nlp = lang_cls.from_config(
429 config,
430 vocab=vocab,
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/language.py in from_config(cls, config, vocab, disable, exclude, meta, auto_fill, validate)
1637 # then we would load them twice at runtime: once when we make from config,
1638 # and then again when we load from disk.
-> 1639 nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
1640 if after_creation is not None:
1641 nlp = after_creation(nlp)
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/language.py in __init__(self, vocab, max_length, meta, create_tokenizer, batch_size, **kwargs)
148 # points. The factory decorator applied to these functions takes care
149 # of the rest.
--> 150 util.registry._entry_point_factories.get_all()
151
152 self._config = DEFAULT_CONFIG.merge(self.default_config)
~/anaconda3/envs/ml/lib/python3.8/site-packages/catalogue/__init__.py in get_all(self)
106 result = {}
107 if self.entry_points:
--> 108 result.update(self.get_entry_points())
109 for keys, value in REGISTRY.items():
110 if len(self.namespace) == len(keys) - 1 and all(
~/anaconda3/envs/ml/lib/python3.8/site-packages/catalogue/__init__.py in get_entry_points(self)
121 result = {}
122 for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
--> 123 result[entry_point.name] = entry_point.load()
124 return result
125
~/anaconda3/envs/ml/lib/python3.8/importlib/metadata.py in load(self)
75 """
76 match = self.pattern.match(self.value)
---> 77 module = import_module(match.group('module'))
78 attrs = filter(None, (match.group('attr') or '').split('.'))
79 return functools.reduce(getattr, attrs, module)
~/anaconda3/envs/ml/lib/python3.8/importlib/__init__.py in import_module(name, package)
125 break
126 level += 1
--> 127 return _bootstrap._gcd_import(name[level:], package, level)
128
129
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap.py in _gcd_import(name, package, level)
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap.py in _find_and_load(name, import_)
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap.py in _find_and_load_unlocked(name, import_)
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap.py in _call_with_frames_removed(f, *args, **kwds)
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap.py in _gcd_import(name, package, level)
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap.py in _find_and_load(name, import_)
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap.py in _find_and_load_unlocked(name, import_)
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap.py in _load_unlocked(spec)
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap_external.py in exec_module(self, module)
~/anaconda3/envs/ml/lib/python3.8/importlib/_bootstrap.py in _call_with_frames_removed(f, *args, **kwds)
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/__init__.py in <module>
----> 1 from . import architectures
2 from . import annotation_setters
3 from . import span_getters
4 from .layers import TransformerModel
5 from .pipeline_component import Transformer, install_extensions
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/architectures.py in <module>
3 from thinc.types import Ragged, Floats2d
4 from spacy.tokens import Doc
----> 5 from .layers import TransformerModel, TransformerListener
6 from .layers import trfs2arrays, split_trf_batch
7 from .util import registry
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/layers/__init__.py in <module>
----> 1 from .listener import TransformerListener
2 from .transformer_model import TransformerModel
3 from .split_trf import split_trf_batch
4 from .trfs2arrays import trfs2arrays
5
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/layers/listener.py in <module>
2 from thinc.api import Model
3 from spacy.tokens import Doc
----> 4 from ..data_classes import TransformerData
5
6
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/data_classes.py in <module>
9 import srsly
10
---> 11 from .util import transpose_list
12 from .align import get_token_positions
13
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/util.py in <module>
2 from pathlib import Path
3 import random
----> 4 from transformers import AutoModel, AutoTokenizer
5 from transformers.tokenization_utils import BatchEncoding
6 from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
~/anaconda3/envs/ml/lib/python3.8/site-packages/transformers/__init__.py in <module>
624
625 # Trainer
--> 626 from .trainer import Trainer
627 from .trainer_pt_utils import torch_distributed_zero_first
628 else:
~/anaconda3/envs/ml/lib/python3.8/site-packages/transformers/trainer.py in <module>
67 TrainerState,
68 )
---> 69 from .trainer_pt_utils import (
70 DistributedTensorGatherer,
71 SequentialDistributedSampler,
~/anaconda3/envs/ml/lib/python3.8/site-packages/transformers/trainer_pt_utils.py in <module>
38 SAVE_STATE_WARNING = ""
39 else:
---> 40 from torch.optim.lr_scheduler import SAVE_STATE_WARNING
41
42 logger = logging.get_logger(__name__)
ImportError: cannot import name 'SAVE_STATE_WARNING' from 'torch.optim.lr_scheduler' (/Users/baconbaker/anaconda3/envs/ml/lib/python3.8/site-packages/torch/optim/lr_scheduler.py)
Reverting to torch 1.4.0 from the current stable release 1.8.1 solves the problem, but I don't want to do so.
Is there an alternative solution?
It looks like this is fixed in newer versions of transformers (https://github.com/huggingface/transformers/pull/8979). Try upgrading both transformers and spacy-transformers.
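Since the fix depends on which versions end up installed, a small sketch (standard library only, Python 3.8) to confirm the versions before and after upgrading:
import importlib.metadata as md

# Print the versions of the packages involved in the import chain above.
for pkg in ("torch", "transformers", "spacy", "spacy-transformers"):
    print(pkg, md.version(pkg))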

Constraints in scipy.optimize throwing x0 error

I'm looking to take a list of stocks and adjust their weights in a portfolio until the overall portfolio beta is 1.0. The output of "stonkBetas" is static and is:
[3.19292010501853,
0.7472001935364129,
1.0889157697158605,
0.8944059912707691,
0.04192080860817828,
1.0011520737327186,
0.9155119223385676]
I then create two functions: one that defines the weighted portfolio beta, and a second, con, that enforces the constraint that the weights of the minimized portfolio sum to 1.0.
def betaOpp(weights):
    a,b,c,d,e,f,g = weights
    f = a*stonkBetas[0]+b*stonkBetas[1]+c*stonkBetas[2]+d*stonkBetas[3]+e*stonkBetas[4]+f*stonkBetas[5]+g*stonkBetas[6]
    return f
initial_guess = [.1,.1,.1,.1,.2,.2,.2]
print('hi')
print(sum(initial_guess))
print('bye')
def con(t):
    print('this should be zero:')
    print(sum(t)-1)
    return sum(t) - 1.0
cons = {'type':'eq', 'fun': con}
bnds = ((.02,.8),(.02,.8),(.02,.8),(.02,.8),(.02,.8),(.02,.8),(.02,.8))
res = optimize.minimize(betaOpp,initial_guess, bounds=bnds, constraints=cons)
print(res)
This gives me this output
hi
1.0
bye
this should be zero:
0.0
this should be zero:
0.0
this should be zero:
0.0
this should be zero:
1.4901161193847656e-08
this should be zero:
1.4901161193847656e-08
this should be zero:
1.4901161193847656e-08
this should be zero:
1.4901161193847656e-08
this should be zero:
1.4901161193847656e-08
this should be zero:
1.4901161193847656e-08
this should be zero:
1.4901161193847656e-08
this should be zero:
6.661338147750939e-16
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-30-6567109e94a4> in <module>
16 cons = {'type':'eq', 'fun': con}
17 bnds = ((.02,.8),(.02,.8),(.02,.8),(.02,.8),(.02,.8),(.02,.8),(.02,.8))
---> 18 res = optimize.minimize(betaOpp,x0=initial_guess, bounds=bnds, constraints=cons)
19 print(res)
/opt/miniconda3/lib/python3.6/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
624 elif meth == 'slsqp':
625 return _minimize_slsqp(fun, x0, args, jac, bounds,
--> 626 constraints, callback=callback, **options)
627 elif meth == 'trust-constr':
628 return _minimize_trustregion_constr(fun, x0, args, jac, hess, hessp,
/opt/miniconda3/lib/python3.6/site-packages/scipy/optimize/slsqp.py in _minimize_slsqp(func, x0, args, jac, bounds, constraints, maxiter, ftol, iprint, disp, eps, callback, finite_diff_rel_step, **unknown_options)
424
425 if mode == -1: # gradient evaluation required
--> 426 g = append(sf.grad(x), 0.0)
427 a = _eval_con_normals(x, cons, la, n, m, meq, mieq)
428
/opt/miniconda3/lib/python3.6/site-packages/scipy/optimize/_differentiable_functions.py in grad(self, x)
186 if not np.array_equal(x, self.x):
187 self._update_x_impl(x)
--> 188 self._update_grad()
189 return self.g
190
/opt/miniconda3/lib/python3.6/site-packages/scipy/optimize/_differentiable_functions.py in _update_grad(self)
169 def _update_grad(self):
170 if not self.g_updated:
--> 171 self._update_grad_impl()
172 self.g_updated = True
173
/opt/miniconda3/lib/python3.6/site-packages/scipy/optimize/_differentiable_functions.py in update_grad()
90 self.ngev += 1
91 self.g = approx_derivative(fun_wrapped, self.x, f0=self.f,
---> 92 **finite_diff_options)
93
94 self._update_grad_impl = update_grad
/opt/miniconda3/lib/python3.6/site-packages/scipy/optimize/_numdiff.py in approx_derivative(fun, x0, method, rel_step, abs_step, f0, bounds, sparsity, as_linear_operator, args, kwargs)
389
390 if np.any((x0 < lb) | (x0 > ub)):
--> 391 raise ValueError("`x0` violates bound constraints.")
392
393 if as_linear_operator:
ValueError: `x0` violates bound constraints.
And I just don't understand where I'm going wrong. The sum of x0 is exactly 1.0 - I can see it! Hopefully I'm just doing something stupid here. Please help!
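The error is not about the x0 you pass in: in some scipy versions SLSQP's internal iterate can drift a hair outside the bounds during finite-difference gradient evaluation, and approx_derivative then raises this check. Under that assumption, upgrading scipy is the cleaner fix; a workaround sketch is to pad the bounds by a tiny epsilon and keep the starting point strictly inside them:
import numpy as np
from scipy import optimize

# Workaround sketch: widen the bounds by a floating-point-sized margin and
# clip the initial guess strictly inside them, so a tiny overshoot during the
# finite-difference step no longer trips the bounds check.
eps = 1e-8
bnds = tuple((lo - eps, hi + eps) for lo, hi in [(.02, .8)] * 7)
x0 = np.clip(initial_guess, .02 + eps, .8 - eps)

res = optimize.minimize(betaOpp, x0=x0, method='SLSQP', bounds=bnds, constraints=cons)
print(res.x, res.fun)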

networkx maximum_flow crashes on some pairs of nodes

I have a graph composed of 742 edges, and 360 nodes.
I want to compute the max flow between some pairs of nodes, and for some of them nx.maximum_flow ends with the error pasted below, despite the fact that a path exists between the two nodes concerned.
Any idea what causes that?
Thanks.
ValueError Traceback (most recent call last)
<ipython-input-186-6dae3501e3d0> in <module>()
1 #print(nx.shortest_path(G,source="Sink_0",target="node_32"))
----> 2 nx.maximum_flow(G, "Sink_0", "Aircraft2_32")
/Library/Python/2.7/site-packages/networkx/algorithms/flow/maxflow.pyc in maximum_flow(G, s, t, capacity, flow_func, **kwargs)
156 raise nx.NetworkXError("flow_func has to be callable.")
157
--> 158 R = flow_func(G, s, t, capacity=capacity, value_only=False, **kwargs)
159 flow_dict = build_flow_dict(G, R)
160
/Library/Python/2.7/site-packages/networkx/algorithms/flow/preflowpush.pyc in preflow_push(G, s, t, capacity, residual, global_relabel_freq, value_only)
420 """
421 R = preflow_push_impl(G, s, t, capacity, residual, global_relabel_freq,
--> 422 value_only)
423 R.graph['algorithm'] = 'preflow_push'
424 return R
/Library/Python/2.7/site-packages/networkx/algorithms/flow/preflowpush.pyc in preflow_push_impl(G, s, t, capacity, residual, global_relabel_freq, value_only)
279 break
280 u = next(iter(level.active))
--> 281 height = discharge(u, False)
282 if grt.is_reached():
283 # Global relabeling heuristic.
/Library/Python/2.7/site-packages/networkx/algorithms/flow/preflowpush.pyc in discharge(u, is_phase1)
156 # We have run off the end of the adjacency list, and there can
157 # be no more admissible edges. Relabel the node to create one.
--> 158 height = relabel(u)
159 if is_phase1 and height >= n - 1:
160 # Although the node is still active, with a height at least
/Library/Python/2.7/site-packages/networkx/algorithms/flow/preflowpush.pyc in relabel(u)
125 """
126 grt.add_work(len(R_succ[u]))
--> 127 return min(R_node[v]['height'] for v, attr in R_succ[u].items()
128 if attr['flow'] < attr['capacity']) + 1
129
ValueError: min() arg is an empty sequence
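One way to narrow this down (a diagnostic sketch, not a confirmed fix): check whether some edges are missing an explicit capacity attribute, since networkx treats such edges as having infinite capacity, and try a different flow algorithm via flow_func to see whether the failure is specific to the default preflow_push implementation:
import networkx as nx
from networkx.algorithms.flow import shortest_augmenting_path

# Edges without a 'capacity' attribute are treated as having infinite capacity.
missing = [(u, v) for u, v, d in G.edges(data=True) if "capacity" not in d]
print(len(missing), "edges without an explicit capacity")

# Try an alternative algorithm on the same pair of nodes.
flow_value, flow_dict = nx.maximum_flow(G, "Sink_0", "Aircraft2_32",
                                        flow_func=shortest_augmenting_path)
print(flow_value)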

Applying scipy.sparse.linalg.svds throws a Memory Error?

I'm trying to decompose a sparse matrix (40,000 × 1,400,000) with scipy.sparse.linalg.svds on my 64-bit machine with 140GB RAM, as follows:
k = 5000
tfidf_mtx = tfidf_m.tocsr()
u_45,s_45,vT_45 = scipy.sparse.linalg.svds(tfidf_mtx, k=k)
When k ranges from 1000 to 4500 it works, but when k is 5000 it throws a MemoryError. The precise error is given below:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-6-31a69ce54e2c> in <module>()
4 k = 4000
5 tfidf_mtx = tfidf_m.tocsr()
----> 6 get_ipython().magic(u'time u_50,s_50,vT_50 =linalg.svds(tfidf_mtx, k=k))
7 # print len(s),s
8
/usr/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
2163 magic_name, _, magic_arg_s = arg_s.partition(' ')
2164 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2165 return self.run_line_magic(magic_name, magic_arg_s)
2166
2167 #-------------------------------------------------------------------------
/usr/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
2084 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2085 with self.builtin_trap:
-> 2086 result = fn(*args,**kwargs)
2087 return result
2088
/usr/lib/python2.7/dist-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
/usr/lib/python2.7/dist-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
189 # but it's overkill for just that one bit of state.
190 def magic_deco(arg):
--> 191 call = lambda f, *a, **k: f(*a, **k)
192
193 if callable(arg):
/usr/lib/python2.7/dist-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
1043 else:
1044 st = clock2()
-> 1045 exec code in glob, local_ns
1046 end = clock2()
1047 out = None
<timed exec> in <module>()
/usr/local/lib/python2.7/dist-packages/scipy/sparse/linalg/eigen/arpack/arpack.pyc in svds(A, k, ncv, tol, which, v0, maxiter, return_singular_vectors)
1751 else:
1752 ularge = eigvec[:, above_cutoff]
-> 1753 vhlarge = _herm(X_matmat(ularge) / slarge)
1754
1755 u = _augmented_orthonormal_cols(ularge, nsmall)
/usr/local/lib/python2.7/dist-packages/scipy/sparse/base.pyc in dot(self, other)
244
245 """
--> 246 return self * other
247
248 def __eq__(self, other):
/usr/local/lib/python2.7/dist-packages/scipy/sparse/base.pyc in __mul__(self, other)
298 return self._mul_vector(other.ravel()).reshape(M, 1)
299 elif other.ndim == 2 and other.shape[0] == N:
--> 300 return self._mul_multivector(other)
301
302 if isscalarlike(other):
/usr/local/lib/python2.7/dist-packages/scipy/sparse/compressed.pyc in _mul_multivector(self, other)
463
464 result = np.zeros((M,n_vecs), dtype=upcast_char(self.dtype.char,
--> 465 other.dtype.char))
466
467 # csr_matvecs or csc_matvecs
MemoryError:
When k is 3000 and 4500, the ratio of the sum of the squared singular values to the sum of the squares of all matrix entries is 0.7033 and 0.8230, respectively. I have been searching the net for a long time, but to no avail. Please help or give some ideas on how to achieve this.
So the return is an (M,k) array. On an ordinary older machine:
In [368]: np.ones((40000,1000))
....
In [369]: np.ones((40000,4000))
...
In [370]: np.ones((40000,5000))
...
--> 190 a = empty(shape, dtype, order)
191 multiarray.copyto(a, 1, casting='unsafe')
192 return a
MemoryError:
Now it may just be a coincidence that I hit the memory error at the same size as your code, but if you make the problem big enough you will hit memory errors at some point.
Your stacktrace shows the error occurs while multiplying a sparse matrix and a dense 2d array (other), and the result will be dense as well.
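A rough back-of-the-envelope sketch (assuming float64 and the shapes implied by the question: the intermediate X_matmat(ularge) product and the returned singular-vector blocks are dense arrays) shows why k = 5000 is roughly where 140GB runs out:
import numpy as np

# Sizes of the dense arrays produced for a 40,000 x 1,400,000 input with k = 5000.
M, N, k = 40000, 1400000, 5000

def gib(shape):
    return np.prod(shape, dtype=np.int64) * 8 / 2**30   # float64 bytes, in GiB

print(gib((M, k)))   # left singular vectors:                 ~1.5 GiB
print(gib((N, k)))   # dense (N, k) product / right vectors:  ~52 GiB each
A couple of those (N, k) buffers alive at the same time already account for a large share of the 140GB, which is consistent with the allocation in _mul_multivector being the point of failure.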