'_InvalidUnpickledPCollection' object has no attribute 'windowing' - apache-beam

I am trying to create a pipeline with Apache Beam, where the first step is to center the input time series around 0 by taking the average of the input PCollection, then subtracting each element from the average with a Map. However, running the below script gives me the following error:
'_InvalidUnpickledPCollection' object has no attribute 'windowing'
import apache_beam as beam
import numpy as np
from apache_beam.testing.test_pipeline import TestPipeline
raw_input = np.array(range(1024), dtype=float) # time series is made up of floats
def run_test():
with TestPipeline() as test_pl:
input = test_pl | "Create" >> beam.Create(raw_input)
avg = input | "Average" >> beam.CombineGlobally(beam.combiners.MeanCombineFn())
centered = input | "Center" >> beam.Map(lambda x: x - beam.pvalue.AsSingleton(avg))
test_pl.run()
run_test()

Writing your lambda as a dedicated mapping function solves your error message, however, for reasons unknown to me the output seems to be duplicated (at least when I test this on play.beam.apache.org)
import apache_beam as beam
import numpy as np
from apache_beam.testing.test_pipeline import TestPipeline
def centering(element, average):
return element - average
raw_input = np.array(range(1024), dtype=float) # time series is made up of floats
def run_test():
with TestPipeline() as test_pl:
input = test_pl | "Create" >> beam.Create(raw_input)
avg = input | "Average" >> beam.CombineGlobally(beam.combiners.MeanCombineFn())
centered = input | "Center" >> beam.Map(centering, beam.pvalue.AsSingleton(avg))
centered | beam.Map(print)
test_pl.run()
run_test()
As gorilla_glue pointed out, the reason being that we need to provide an explicit side-input variable (either by providing a method as in my code above or within the lambda, i.e., lambda x, side_input: ...).

Related

Convert string data in HDF5 File to float Format

I need to convert String data from a HDF5 File to Float format to use in a Skyplot (Astropy) with l b coordinates. The data is present here:
https://wwwmpa.mpa-garching.mpg.de/~ensslin/research/data/faraday2020.html
(Faraday Sky 2020)
The code I have programmed until now is:
from astropy import units as u
from astropy.coordinates import SkyCoord
import matplotlib.pyplot as plt
import numpy as np
import h5py
dat = []
ggl=[]
ggb=[]
f1= h5py.File('/home/nikita/faraday_2020/faraday2020.hdf5','r')
data = f1.get('faraday_sky_mean')
faraday_sky_mean = np.array(data)
data1 = f1.get('faraday_sky_std')
faraday_sky_std = np.array(data1)
n1 = 0
for line in f1:
s = line.split()
dat.append(s)
n1 = n1 +1
#
for i in range(0,n1):
ggl.append(float(dat[i][0])) # galactic coordinates input
ggb.append(float(dat[i][1]))
f1.close()
However I am getting the error:
ggl.append(float(dat[i][0])) # galactic coordinates input
ValueError: could not convert string to float: 'faraday_sky_mean'
Please help with this. Thanks.
What what you asked and what (I think) you need are 2 different things.
This line is NOT the way to read a HDF5 file: for line in f1:
You need to use a HDF5 API to read it (h5py is 1 of many).
I think you want to read datasets faraday_sky_mean and faraday_sky_std and load arrays into lists ggl and ggb. To do that, use this code. It will create 2 lists with 3145728 float64 values in each.
with h5py.File('faraday2020.hdf5','r') as hdf:
print(list(hdf.keys()))
faraday_sky_mean = hdf['faraday_sky_mean'][:]
faraday_sky_std = hdf['faraday_sky_std'][:]
print(faraday_sky_mean.shape, faraday_sky_mean.dtype)
print(f'Max Mean={max(faraday_sky_mean)}, Min Mean={min(faraday_sky_mean)}')
print(faraday_sky_std.shape, faraday_sky_std.dtype)
print(f'Max StdDev={max(faraday_sky_std)}, Min StdDev={min(faraday_sky_std)}')
ggl = faraday_sky_mean.tolist()
print(len(ggl),type(ggl[0]))
ggb = faraday_sky_std.tolist()
print(len(ggb),type(ggb[0]))
The procedure above saves the data as both NumPy arrays and Python lists. If you only need the lists (don't need the arrays), you can shorten the code as shown below:
with h5py.File('faraday2020.hdf5','r') as hdf:
ggl = hdf['faraday_sky_mean'][:].tolist()
print(len(ggl),type(ggl[0]))
ggb = hdf['faraday_sky_std'][:].tolist()
print(len(ggb),type(ggb[0]))

How to write a flexible multiple exponential fit

I'd like to write a more or less universial fit function for general function
$f_i = \sum_i a_i exp(-t/tau_i)$
for some data I have.
Below is an example code for a biexponential function but I would like to be able to fit a monoexponential or a triexponential function with the smallest code adaptions possible.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
t = np.linspace(0, 10, 100)
a_1 = 1
a_2 = 1
tau_1 = 5
tau_2 = 1
data = 1*np.exp(-t/5) + 1*np.exp(-t/1)
data += 0.2 * np.random.normal(size=t.size)
def func(t, a_1, tau_1, a_2, tau_2): # plus more exponential functions
return a_1*np.exp(-t/tau_1)+a_2*np.exp(-t/tau_2)
popt, pcov = curve_fit(func, t, data)
print(popt)
plt.plot(t, data, label="data")
plt.plot(t, func(t, *popt), label="fit")
plt.legend()
plt.show()
In principle I thought of redefining the function to a general form
def func(t, a, tau): # with a and tau as a list
tmp = 0
tmp += a[i]*np.exp(-t/tau[i])
return tmp
and passing the arguments to curve_fit in the form of lists or tuples. However I get a TypeError as shown below.
TypeError: func() takes 4 positional arguments but 7 were given
Is there anyway to rewrite the code that you can only by the input parameters of curve_fit "determine" the degree of the multiexponential function? So that passing
a = (1)
results in a monoexponential function whereas passing
a = (1, 2, 3)
results in a triexponential function?
Regards
Yes, that can be done easily with np.broadcasting:
def func(t, a, taus): # plus more exponential functions
a=np.array(a)[:,None]
taus=np.array(taus)[:,None]
return (a*np.exp(-t/taus)).sum(axis=0)
func accepts 2 lists, converts them into 2-dim np.array, computes a matrix with all the exponentials and then sums it up. Example:
t=np.arange(100).astype(float)
out=func(t,[1,2],[0.3,4])
plt.plot(out)
Keep in mind a and taus must be the same length, so sanitize your inputs as you see fit. Or you could also directly pass np.arrays instead of lists.

Matrix Multiplication A^T * A in PySpark

I asked a similar question yesterday - Matrix Multiplication between two RDD[Array[Double]] in Spark - however I've decided to shift to pyspark to do this. I've made some progress loading and reformatting the data - Pyspark map from RDD of strings to RDD of list of doubles - however the matrix multiplcation is difficult. Let me share my progress first:
matrix1.txt
1.2 3.4 2.3
2.3 1.1 1.5
3.3 1.8 4.5
5.3 2.2 4.5
9.3 8.1 0.3
4.5 4.3 2.1
it's difficult to share files, however this is what my matrix1.txt file looks like. It is a space-delimited text file including the values of a matrix. Next is the code:
# do the imports for pyspark and numpy
from pyspark import SparkConf, SparkContext
import numpy as np
# loadmatrix is a helper function used to read matrix1.txt and format
# from RDD of strings to RDD of list of floats
def loadmatrix(sc):
data = sc.textFile("matrix1.txt").map(lambda line: line.split(' ')).map(lambda line: [float(x) for x in line])
return(data)
# this is the function I am struggling with, it should take a line of the
# matrix (formatted as list of floats), compute an outer product with itself
def AtransposeA(line):
# pseudocode for this would be...
# outerprod = compute line * line^transpose
# return(outerprod)
# here is the main body of my file
if __name__ == "__main__":
# create the conf, sc objects, then use loadmatrix to read data
conf = SparkConf().setAppName('SVD').setMaster('local')
sc = SparkContext(conf = conf)
mymatrix = loadmatrix(sc)
# this is pseudocode for calling AtransposeA
ATA = mymatrix.map(lambda line: AtransposeA(line)).reduce(elementwise add all the outerproducts)
# the SVD of ATA is computed below
U, S, V = np.linalg.svd(ATA)
# ...
My approach is as follows - to do matrix multiplication A^T * A, I create a function that computes outer products of rows of A. The elementwise sum of all of the outerproducts is the product I want. I then call AtransposeA() in a map function, that way is it performed on each row of the matrix, and finally I use a reduce() to add the resulting matrices.
I'm struggling thinking about how the AtransposeA function should look. How can I do an outerproduct in pyspark like this? Thanks in advance for help!
First, consider why you want to use Spark for this. It sounds like all your data fits in memory, in which case you can use numpy and pandas in a very straight-forward way.
If your data isn't structured so that rows are independent, then it probably can't be parallelized by sending groups of rows to different nodes, which is the whole point of using Spark.
Having said that... here is some pyspark (2.1.1) code that I think does what you want.
# read the matrix file
df = spark.read.csv("matrix1.txt",sep=" ",inferSchema=True)
df.show()
+---+---+---+
|_c0|_c1|_c2|
+---+---+---+
|1.2|3.4|2.3|
|2.3|1.1|1.5|
|3.3|1.8|4.5|
|5.3|2.2|4.5|
|9.3|8.1|0.3|
|4.5|4.3|2.1|
+---+---+---+
# do the sum of the multiplication that we want, and get
# one data frame for each column
colDFs = []
for c2 in df.columns:
colDFs.append( df.select( [ F.sum(df[c1]*df[c2]).alias("op_{0}".format(i)) for i,c1 in enumerate(df.columns) ] ) )
# now union those separate data frames to build the "matrix"
mtxDF = reduce(lambda a,b: a.select(a.columns).union(b.select(a.columns)), colDFs )
mtxDF.show()
+------------------+------------------+------------------+
| op_0| op_1| op_2|
+------------------+------------------+------------------+
| 152.45|118.88999999999999| 57.15|
|118.88999999999999|104.94999999999999| 38.93|
| 57.15| 38.93|52.540000000000006|
+------------------+------------------+------------------+
This seems to be the same result that you get from numpy.
a = numpy.genfromtxt("matrix1.txt")
numpy.dot(a.T, a)
array([[ 152.45, 118.89, 57.15],
[ 118.89, 104.95, 38.93],
[ 57.15, 38.93, 52.54]])

scipy.optimize failure with a "vectorized" implementation

I have an optimization problem (1d) coded in 2 ways - one using a for loop and an other using numpy arrays. The for loop version works fine but the numpy one fails.
Actually it is a bit more complicated, it can work with different starting points (!!) or if I choose an other optimization algo like CG.
The 2 versions (functions and gradients) are giving the same results and the returned types are also the same as far as I can tell.
Here is my example, what am I missing?
import numpy as np
from scipy.optimize import minimize
# local params
v1 = np.array([1., 1.])
v2 = np.array([1., 2.])
# local functions
def f1(x):
s = 0
for i in range(len(v1)):
s += (v1[i]*x-v2[i])**2
return 0.5*s/len(v1)
def df1(x):
g = 0
for i in range(len(v1)):
g += v1[i]*(v1[i]*x-v2[i])
return g/len(v1)
def f2(x):
return 0.5*np.sum((v1*x-v2)**2)/len(v1)
def df2(x):
return np.sum(v1*(v1*x-v2))/len(v1)
x0 = 10. # x0 = 2 works
# tests...
assert np.abs(f1(x0)-f2(x0)) < 1.e-6 and np.abs(df1(x0)-df2(x0)) < 1.e-6 \
and np.abs((f1(x0+1.e-6)-f1(x0))/(1.e-6)-df1(x0)) < 1.e-4
# BFGS for f1: OK
o = minimize(f1, x0, method='BFGS', jac=df1)
if not o.success:
print('FAILURE', o)
else:
print('SUCCESS min = %f reached at %f' % (f1(o.x[0]), o.x[0]))
# BFGS for f2: failure
o = minimize(f2, x0, method='BFGS', jac=df2)
if not o.success:
print('FAILURE', o)
else:
print('SUCCESS min = %f reached at %f' % (f2(o.x[0]), o.x[0]))
The error I get is
A1 = I - sk[:, numpy.newaxis] * yk[numpy.newaxis, :] * rhok
IndexError: invalid index to scalar variable.
but I doesn't really helps me since it can work with some other starting values.
I am using an all new fresh python install (python 3.5.2, scipy 0.18.1 and numpy 1.11.3).
The solver expects the return value of jacobian df2 to be the same shape as its input x. Even though you passed in a scalar here, it's actually converted into a single element ndarray. Since you used np.sum, your result became scalar and that causes strange things to happen.
Enclose the scalar result of df2 with np.array, and your code should work.

Training an LSTM neural network to forecast time series in pybrain, python

I have a neural network created using PyBrain and designed to forecast time series.
I am using the sequential dataset function, and trying to use a sliding window of 5 previous values to predict the 6th. One of my problems is that I can't figure out how to create the required dataset by appending the 5 previous values to the inputs and the 6th as an output.
I am also unsure of how exactly to forecast values in the series once the network is trained.
Posting my code below:
from pybrain.datasets import SupervisedDataSet
from pybrain.datasets import SequentialDataSet
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.supervised.trainers import RPropMinusTrainer
from pylab import ion, ioff, figure, draw, contourf, clf, show, hold, plot
from pybrain.structure import RecurrentNetwork
from pybrain.structure import FeedForwardNetwork
from pybrain.structure import LinearLayer, SigmoidLayer, TanhLayer
from pybrain.structure import FullConnection
from pybrain.structure import LSTMLayer
from pybrain.structure import BiasUnit
from pybrain.rl.learners.valuebased import Q
import pybrain
import matplotlib as plt
import translate
import time
import pickle
import scipy as sp
import numpy as np
import pylab as pl
import itertools
#Opening data from database
data = translate.translate(3600)
time, price, volume = zip(*data)
#Creating data lists instead of tuples
timeList = []
priceList = []
volumeList = []
for record in time:
timeList.append(record)
for record in price:
priceList.append(record)
for record in volume:
volumeList.append(record)
#Creating lookback window and target
datain = priceList[:5]
dataout = priceList[6]
print datain
print dataout
#Creating the dataset
ds = SequentialDataSet(5, 1)
for x, y in itertools.izip(datain, dataout):
ds.newSequence()
ds.appendLinked(tuple(x), tuple(y))
print (x, y)
print ds
#Building the network
n = RecurrentNetwork()
#Create the network modules
n.addInputModule(SigmoidLayer(5, name = 'in'))
n.addModule(LSTMLayer(100, name = 'LSTM'))
n.addModule(LSTMLayer(100, name = 'LSTM2'))
n.addOutputModule(SigmoidLayer(1, name = 'out'))
#Add the network connections
n.addConnection(FullConnection(n['in'], n['LSTM'], name = 'c_in_to_LSTM'))
n.addConnection(FullConnection(n['in'], n['LSTM2'], name = 'c_in_to_LSTM2'))
n.addConnection(FullConnection(n['LSTM'], n['out'], name = 'c_LSTM_to_out'))
n.addConnection(FullConnection(n['LSTM2'], n['out'], name = 'c_LSTM2_to_out'))
n.sortModules()
n.randomize()
#Creating the trainer
trainer = BackpropTrainer(n, ds)
#Training the network
#for i in range (1000):
# print trainer.train()
#Make predictions
#Plotting the results
pl.plot(time, price)
pl.show()
The above code gives:
TypeError: izip argument #2 must support iteration
I have seen the question linked below however I haven't been successful
Event Sequences, Recurrent Neural Networks, PyBrain
First question on this great site, any help is appreciated
#Creating lookback window and target
datain = priceList[:5]
dataout = priceList[6]
Not an expert. But it seems your datain is a list with length=6 while dataout is not.
I'd guess the TypeError says it all. Whereas priceList[:5] is a list and hence iterable, priceList[6] is a single element.
You'd probably want something like
datain = priceList[:5]
dataout = priceList[6:6]
which will make dataout a list with a single element.