getting __init__() got an unexpected keyword argument 'outputCols' when doing the oneHotEncoder - pyspark

stages = []
for categoricalCol in categoricalColumns:
stringIndexer = StringIndexer(
inputCol=categoricalCol, outputCol=categoricalCol + "Index"
)
encoder = OneHotEncoder(
inputCols=[stringIndexer.getOutputCol()],
outputCols=[categoricalCol + "classVec"],
)
stages += [stringIndexer, encoder]
label_stringIdx = StringIndexer(inputCol="Status", outputCol="label")
stages += [label_stringIdx]
assemblerInputs = [c + "classVsc" for c in categoricalColumns] + numericColumns
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
> TypeError
> Traceback (most recent call last) <ipython-input-18-7156eaeec61b> in <module>
> 2 for categoricalCol in categoricalColumns:
> 3 stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
> ----> 4 encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
> 5 stages += [stringIndexer, encoder]
> 6 label_stringIdx = StringIndexer(inputCol = 'Status', outputCol = 'label')
>
> /usr/local/spark/python/pyspark/__init__.py in wrapper(self, *args, **kwargs)
> 102 raise TypeError("Method %s forces keyword arguments." % func.__name__)
> 103 self._input_kwargs = kwargs
> --> 104 return func(self, **kwargs)
> 105 return wrapper
> 106
>
> TypeError: __init__() got an unexpected keyword argument 'outputCols'

Check the documentation:
https://spark.apache.org/docs/2.2.2/api/python/pyspark.ml.html?highlight=onehotencoder#pyspark.ml.feature.OneHotEncoder
The param is outputCol

Related

SciPy opitimize 'ValueError: setting an array element with a sequence.' ARIMA models

I know that the issue 'ValueError: setting an array element with a sequence.' is normally because the function being optimized is a vector and not a scalar, anyhow my ARMA model below still gets this issue. Here is the code
def loghood(parm,endog,exog,p,q):
arparams,maparams,exogparams,bias = parm
armapredicts=np.zeros(endog.shape[0])
bias=0
res=abs(endog - np.mean(endog))
if p==0:
for i in range(1,endog.shape[0]-p):
armapredicts[i] = np.array([[res[i-f+q]] for f in range(0,q)]).dot(maparams.T) + exog.iloc[i,:].dot(exogparams.T) + bias
if q==0:
for i in range(1,endog.shape[0]-q):
armapredicts[i] = np.array([ [endog[i-f+p]] for f in range(0,p)]).T.dot(arparams) + exog.iloc[i,:].T.dot(exogparams) + bias
else:
for i in range(1,endog.shape[0]-2):
armapredicts[i] = np.array([ [endog[i-f+p]] for f in range(0,p)]).T.dot(arparams.reshape(-1,1)) + np.array([[res[i-f+q]] for f in range(0,q)]).T.dot(maparams.reshape(-1,1)) + exog.iloc[i,:].T.dot(exogparams.reshape(-1,1)) + bias
print(np.array([ [endog[i-f+p]] for f in range(0,p)]).T.shape )
print(maparams.reshape(-1,1).shape)
liklihood=1/((2*np.pi*armapredicts)**(1/2))*np.exp(-res**2/(2*armapredicts**2))
print(liklihood.shape)
log_hood=np.sum(np.log(liklihood.values))
print(log_hood)
return log_hood
x0=[np.ones(2),np.ones(2),np.ones(returnsant.shape[1]-1),1]
x0=np.array(x0,dtype=object).flatten()
res = spop.minimize(loghood,x0 ,args=(returnsant['Hedge Fund'],returnsant.drop('Hedge Fund',axis= 1),2,2), method='Nelder-mead')
print(res)
I know that likelihood is a 1-d vector and the log_hood is certainly scalar after np.sum, so how is this error occurring?? Thank you for your time.
EDIT:forgot to include the full error message
TypeError Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
/var/folders/7f/gmpqnwqx0lb4nrsz2vqhvgn40000gp/T/ipykernel_30671/32993526.py in <module>
1 x0=[np.ones(2),np.ones(2),np.ones(returnsant.shape[1]-1),1]
2 #x0=np.array(x0,dtype=object).flatten()
----> 3 res = spop.minimize(loghood,x0 ,args=(returnsant['Hedge Fund'],returnsant.drop('Hedge Fund',axis= 1),2,2), method='Nelder-mead')
4 print(res)
~/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
609
610 if meth == 'nelder-mead':
--> 611 return _minimize_neldermead(fun, x0, args, callback, bounds=bounds,
612 **options)
613 elif meth == 'powell':
~/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/optimize.py in _minimize_neldermead(func, x0, args, callback, maxiter, maxfev, disp, return_all, initial_simplex, xatol, fatol, adaptive, bounds, **unknown_options)
687 zdelt = 0.00025
688
--> 689 x0 = asfarray(x0).flatten()
690
691 if bounds is not None:
~/opt/anaconda3/lib/python3.9/site-packages/numpy/core/overrides.py in asfarray(*args, **kwargs)
~/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/type_check.py in asfarray(a, dtype)
112 if not _nx.issubdtype(dtype, _nx.inexact):
113 dtype = _nx.float_
--> 114 return asarray(a, dtype=dtype)
115
116
ValueError: setting an array element with a sequence.
FINAL EDIT:
I resolved the issue by simply creating a big array x0 of all my parameters
x0=np.zeros(5+returnsant.shape[1]-1)
res = spop.minimize(loghood,x0,args=(returnsant['Hedge Fund'],returnsant.drop('Hedge Fund',axis= 1),0,2), method='Nelder-mead')
print(res)
now I declare my subparameters in loghood
def loghood(parms,endog,exog,p,q):
arparams,maparams,exogparams,bias = parms[0:p],parms[p:p+q],parms[p+q:p+q+exog.shape[1]],parms[p+q+exog.shape[1]:-1]
armapredicts=np.zeros(endog.shape[0])
bias=0
res=abs(endog - np.mean(endog))
if p==0:
for i in range(1,endog.shape[0]-q):
armapredicts[i] = np.array([[res[i-f+q]] for f in range(0,q)]).T.dot(maparams) + exog.iloc[i,:].T.dot(exogparams) + bias
if q==0:
for i in range(1,endog.shape[0]-p):
armapredicts[i] = np.array([ [endog[i-f+p]] for f in range(0,p)]).T.dot(arparams) + exog.iloc[i,:].T.dot(exogparams) + bias
else:
for i in range(1,endog.shape[0]-2):
armapredicts[i] = np.array([ [endog[i-f+p]] for f in range(0,p)]).T.dot(arparams.reshape(-1,1)) + np.array([[res[i-f+q]] for f in range(0,q)]).T.dot(maparams.reshape(-1,1)) + exog.iloc[i,:].T.dot(exogparams.reshape(-1,1)) + bias
liklihood=1/((2*np.pi*armapredicts)**(1/2))*np.exp(-res**2/(2*armapredicts**2))
log_hood=np.sum(-np.log(liklihood.squeeze()))
return log_hood

to pass a variable from a method to the main body

I made a code to compare 2 times . The structure of the code constitutes of a main body getting the 2 times. A class and 2 methods are converting the times into second just to make an integer. The problem is that the variable showing the integer doesn't pass to the main body. The code is as follow.
class Time():
def __init__(self, other=None):
self.other = other
def comparison(self, other):
self.other = other
return other > self
def time_to_int(self, other):
self.other = other
other = self.hour * 3600 + self.minute * 60 + self.second
print( other )
start = Time()
start.hour = 2.0
start.minute = 87
start.second = 98
start_time = Time()
start.time_to_int( start_time )
end = Time()
end.hour = 3.0
end.minute = 87
end.second = 98
end_time = Time()
end.time_to_int( end_time )
print( start_time, end_time )
start_time.comparison( end_time )
The result is
12518.0
16118.0
<__main__.Time object at 0x7f9ca1854110> <__main__.Time object at 0x7f9ca18541d0>
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1-dc5298ddf4b1> in <module>()
31
32
---> 33 start_time.comparison(end_time)
34
<ipython-input-1-dc5298ddf4b1> in comparison(self, other)
5 def comparison(self, other):
6 self.other=other
----> 7 return other > self
8 def time_to_int(self, other):
9 self.other=other
TypeError: '>' not supported between instances of 'Time' and 'Time'
Your comparison method uses other > self, and the TypeError tells you that this is not defined. What's your question?

ValueError: 'attacks' is not in list

import json
i = 1
file_name = 'PolitiFact_Fake_' + str(i) + '-Webpage.json'
with open(file_name, 'r') as fp:
obj = json.load(fp)
text = obj['text']
length = len(text)
wordlist = text.split()
wordfreq = []
for w in wordlist:
wordfreq.append(wordlist.count(w))
lengthslova = len(wordlist)
wordfind = 'in'
indexword = wordlist.index(wordfind)
indexfreq = wordfreq[indexword]
findword = [wordfreq, wordlist]
findwordt = [[row[j] for row in findword] for j in range(lengthslova)]
wordfind = "attacks"
indexfreq = 0
if indexword != ValueError:
indexword = wordlist.index(wordfind)
indexfreq = wordfreq[indexword]
findword = [wordfind, indexfreq]
indexfreq = wordfreq[indexword]
findword = [wordfind, indexfreq]
print('The freq of word ' + str(wordfind) + ':', indexfreq)
else:
indexfreq = 0
findword = [wordfind, indexfreq]
print('The freq of word ' + str(wordfind) + ':', indexfreq)
I keep receiving this error:
ValueError: 'attacks' is not in list

matplotlib.dates loadtxt converters date conversion python 2

i am using juypter notebook and python 2.using matplotlib.dates I am unable to access the date
def bytespdate2num(fmt,encoding = 'utf-8'):
strconverter = mdates.strpdate2num(fmt)
def bytesconverter(b):
s = b.decode(encoding)
return strconverter(s)
return bytesconverter
Date,Open,High,Low,Close,Adjusted_close,Volume = np.loadtxt('C:\Users\harsh\Desktop\stock.txt', delimiter=',', unpack=True, converters ={0: bytespdate2num('%Y%m%d')})def bytespdate2num(fmt,encoding = 'utf-8'):
strconverter = mdates.strpdate2num(fmt)
def bytesconverter(b):
s = b.decode(encoding)
return strconverter(s)
return bytesconverter
Date,Open,High,Low,Close,Adjusted_close,Volume = np.loadtxt('C:\Users\harsh\Desktop\stock.txt', delimiter=',', unpack=True, converters ={0: bytespdate2num('%Y%m%d')})
the error message is:
> ValueErrorTraceback (most recent call last) <ipython-input-19-f69b316ac453> in <module>()
> 5 return strconverter(s)
> 6 return bytesconverter
> ----> 7 Date,Open,High,Low,Close,Adjusted_close,Volume = np.loadtxt('C:\Users\harsh\Desktop\stock.txt', delimiter=',',
> unpack=True, converters ={0: bytespdate2num('%Y%m%d')})
>
> C:\Users\harsh\Anaconda2\lib\site-packages\numpy\lib\npyio.pyc in
> loadtxt(fname, dtype, comments, delimiter, converters, skiprows,
> usecols, unpack, ndmin) 1022 1023 # Convert each
> value according to its column and store
> -> 1024 items = [conv(val) for (conv, val) in zip(converters, vals)] 1025 # Then pack it according to
> the dtype's nesting 1026 items = pack_items(items,
> packing)
>
> <ipython-input-19-f69b316ac453> in bytesconverter(b)
> 3 def bytesconverter(b):
> 4 s = b.decode(encoding)
> ----> 5 return strconverter(s)
> 6 return bytesconverter
> 7 Date,Open,High,Low,Close,Adjusted_close,Volume = np.loadtxt('C:\Users\harsh\Desktop\stock.txt', delimiter=',',
> unpack=True, converters ={0: bytespdate2num('%Y%m%d')})
>
> C:\Users\harsh\Anaconda2\lib\site-packages\matplotlib\dates.pyc in
> __call__(self, s)
> 287 return value: a date2num float
> 288 """
> --> 289 return date2num(datetime.datetime(*time.strptime(s, self.fmt)[:6]))
> 290
> 291
>
> C:\Users\harsh\Anaconda2\lib\_strptime.pyc in
> _strptime_time(data_string, format)
> 476
> 477 def _strptime_time(data_string, format="%a %b %d %H:%M:%S %Y"):
> --> 478 return _strptime(data_string, format)[0]
>
> C:\Users\harsh\Anaconda2\lib\_strptime.pyc in _strptime(data_string,
> format)
> 330 if not found:
> 331 raise ValueError("time data %r does not match format %r" %
> --> 332 (data_string, format))
> 333 if len(data_string) != found.end():
> 334 raise ValueError("unconverted data remains: %s" %
>
>
> ValueError: time data u'2017-07-26' does not match format '%Y%m%d'

Cryptic TypeError: 'decimal.Decimal' object cannot be interpreted as an integer

I am struggling to understand why this function apparently fails in the Jupyter Notebook, but not in the IPython shell:
def present_value( r, n, fv = None, pmt = None ):
'''
Function to compute the Present Value based on interest rate and
a given future value.
Arguments accepted
------------------
* r = interest rate,
which should be given in its original percentage, eg.
5% instead of 0.05
* n = number of periods for which the cash flow,
either as annuity or single flow from one present value
* fv = future value in dollars,
if problem is annuity based, leave this empty
* pmt = each annuity payment in dollars,
if problem is single cash flow based, leave this empty
'''
original_args = [r, n, fv, pmt]
dec_args = [Decimal( arg ) if arg != None
else arg
for arg in original_args
]
if dec_args[3] == None:
return dec_args[2] / ( ( 1 + ( dec_args[0] / 100 ) )**dec_args[1] )
elif dec_args[2] == None:
# annuity_length = range( 1, dec_args[1] + 1 )
# Not allowed to add a Decimal object
# with an integer and to use it
# in the range() function,
# so we dereference the integer from original_args
annuity_length = range( 1, original_args[1] + 1 )
# Apply discounting to each annuity payment made
# according to number of years left till end
all_compounded_pmt = [dec_args[3] * ( 1 / ( ( 1 + dec_args[0] / 100 ) ** time_left ) ) \
for time_left in annuity_length
]
return sum( all_compounded_pmt )
When I imported the module that this function resides in, named functions.py, using from functions import *, and then executed present_value(r=7, n=35, pmt = 11000), I got the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-93-c1cc587f7e27> in <module>()
----> 1 present_value(r=7, n=35, pmt = 11000)
/path_to_file/functions.py in present_value(r, n, fv, pmt)
73 if dec_args[3] == None:
74 return dec_args[2]/((1 + (dec_args[0]/100))**dec_args[1])
---> 75
76 elif dec_args[2] == None:
77 # annuity_length = range(1, dec_args[1]+1)
TypeError: 'decimal.Decimal' object cannot be interpreted as an integer
but in the IPython shell, evaluating this function it works perfectly fine:
In [42]: functions.present_value(r=7, n=35, pmt = 11000)
Out[42]: Decimal('142424.39530474029537')
Can anyone please help me with this really confusing and obscure issue?