I am trying to create a custom activation layer based on the MNIST example in NeuPy. However, once I apply my custom layer to the network, it stops training. In my custom function I want to convert the input from a floating-point value to a fixed-point value for both the ReLU and the Softmax functions. To do that, I wrote a function called "float_limit", which converts a floating-point value to a fixed-point value. My first idea was to use int() inside float_limit, but that raises a type error because int() cannot be applied to a tensor variable. So I replaced int() with T.floor(), which does the same job as int(). The ReLU function works fine after applying float_limit(), but once I also apply it to the softmax function, the network stops training. How can I fix this problem?
This is my code:
from sklearn import datasets, model_selection
from sklearn.preprocessing import OneHotEncoder
from neupy import environment,algorithms, layers
import numpy as np
from sklearn.model_selection import train_test_split
import theano
import theano.tensor as T
# load data
mnist = datasets.fetch_mldata('MNIST original')
data, target =,
# make one hot
data = data / 255.
data = data - data.mean(axis=0)
target_scaler = OneHotEncoder()
target = target_scaler.fit_transform(target.reshape((-1, 1)))
target = target.todense()
# split data for training and testing
x_train, x_test, y_train, y_test = train_test_split(
train_size=(6. / 7)
# Theano is a main backend for the Gradient Descent based algorithms in NeuPy.
theano.config.floatX = 'float32'
#################### create new transfer function ###########################
################# float limit #####################
# # idea code
# def float_limit(n, b):
# d = 2 ** b
# return int(n * d) / d
def float_limit(n, b):
    """Round ``n`` down onto a fixed-point grid with ``b`` fractional bits.

    The forward value is ``floor(n * 2**b) / 2**b``, exactly as before.

    ``floor`` is piecewise constant, so its derivative is zero almost
    everywhere.  Differentiating through it therefore hands a zero gradient
    to every parameter that feeds this function, and training stalls.  To
    keep the network trainable the quantisation step is treated as the
    identity during back-propagation (a "straight-through" gradient): only
    the quantisation *error* is excluded from the gradient graph.

    :param n: symbolic tensor (or number) to quantise.
    :param b: number of fractional bits.
    :return: symbolic tensor holding the quantised value.
    """
    d = 2.0 ** b
    quantized = T.floor(n * d) / d
    # n + (quantized - n) evaluates to the quantised value, while the
    # gradient only flows through the leading ``n`` term.
    # NOTE(review): the sum is subject to ordinary floating-point rounding;
    # confirm it still lands exactly on the fixed-point grid for your ranges.
    return n + theano.gradient.disconnected_grad(quantized - n)
################ custom function ##################
################## relu ##################
def relu(x, alpha=0):
    """Rectified linear unit applied to an input quantised by float_limit.

    The input is first limited to 8 fractional bits.  With ``alpha == 0``
    the result is ``0.5 * (x + |x|)``; otherwise the leaky form
    ``f1 * x + f2 * |x|`` with ``f1 = 0.5 * (1 + alpha)`` and
    ``f2 = 0.5 * (1 - alpha)`` is returned.

    :param x: symbolic input tensor.
    :param alpha: slope used for negative inputs (0 gives the plain ReLU).
    :return: symbolic tensor with the activation applied.
    """
    # Both branches quantise the input the same way, so do it once up front.
    x = float_limit(x, 8)
    if alpha == 0:
        return 0.5 * (x + abs(x))
    # ``T`` already is ``theano.tensor``; the former ``T.tensor.as_tensor_variable``
    # lookup failed as soon as a non-zero alpha was passed.
    alpha = T.as_tensor_variable(alpha)
    f1 = 0.5 * (1 + alpha)
    f2 = 0.5 * (1 - alpha)
    return f1 * x + f2 * abs(x)
class custom_relu(layers.ActivationLayer):
    # Activation layer whose non-linearity is the module-level relu(),
    # called with its default alpha.

    def activation_function(self, input_value):
        # Delegate to relu(); no additional arguments are forwarded.
        activated = relu(input_value)
        return activated
#################### softmax ########################
class custom_softmax(layers.ActivationLayer):
    # Activation layer that limits its input to 8 fractional bits and then
    # applies Theano's softmax.

    def activation_function(self, input_value):
        limited = float_limit(input_value, 8)
        return T.nnet.softmax(limited)
########### start the model architecture ############
network = algorithms.Momentum(
custom_relu(500), #Relu
# Squared(300),
# layers.Relu(300), #Relu
custom_softmax(10), #Softmax
# layers.Sigmoid
# layers.Input(784),
# tansig(500),
# tansig(500),
# print the architecture(Input shape, Layer Type, Output shape)
# train the network
network.train(x_train, y_train, x_test, y_test, epochs=30)
# show the accuracy
from sklearn import metrics
y_predicted = network.predict(x_test).argmax(axis=1)
y_test = np.asarray(y_test.argmax(axis=1)).reshape(len(y_test))
print(metrics.classification_report(y_test, y_predicted))
score = metrics.accuracy_score(y_test, y_predicted)
print("Validation accuracy: {:.2%}".format(score))
# plot the image
from neupy import plots


How can I fix tensor dimension matching error (with 1 unit difference)

I'm trying to run my code for Graph Convolution Network (GCN) in PyTorch with several .csv input files, but I get error below:
RuntimeError: The expanded size of the tensor (732) must match the existing size (731) at non-singleton dimension 0. Target sizes: [732]. Tensor sizes: [731]
here is my code:
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from sklearn.metrics import r2_score
import numpy as np
import datetime
import dgl.function as fn
# Below are the graph convolution functions:
# (where each node collects information about nearby nodes)
def gcn_message(edges):
    """Build the per-edge message: the 'h' feature of each edge's source node.

    :param edges: edge batch exposing a ``src`` mapping that contains 'h'.
    :return: dict with the single key 'msg' holding ``edges.src['h']``.
    """
    return {'msg': edges.src['h']}
def gcn_reduce(nodes):
    """Sum the messages in each node's mailbox along dimension 1.

    :param nodes: node batch exposing a ``mailbox`` mapping that contains 'msg'.
    :return: dict with the single key 'h' holding the summed messages.
    """
    incoming = nodes.mailbox['msg']
    return {'h': torch.sum(incoming, dim=1)}
# Below is the pytorch module that defines the operations at each graph convolution layer
class gcnLayer(nn.Module):
    """One graph-convolution layer.

    Each node's own features are concatenated with the mean of its
    neighbours' features and the result is passed through a linear map.
    """

    def __init__(self, in_feats, out_feats):
        """
        :param in_feats: number of features per node on input.
        :param out_feats: number of features per node on output.
        """
        super(gcnLayer, self).__init__()
        # The linear layer sees [own features, aggregated neighbour features],
        # hence twice the input width.
        self.linear = nn.Linear(in_feats * 2, out_feats)

    def forward(self, g, inputs):
        """Run the layer on graph ``g`` with per-node feature matrix ``inputs``.

        :param g: graph object providing ``local_scope``, ``ndata`` and ``update_all``.
        :param inputs: node features, one row per node (POI features).
        :return: linearly transformed concatenation of ``inputs`` and the
            neighbour mean.
        """
        # local_scope keeps the temporary 'h' / 'h_N' fields from leaking
        # into the caller's graph.
        with g.local_scope():
            g.ndata['h'] = inputs
            # Copy each source node's 'h' into message 'm', then average the
            # incoming messages into 'h_N'.
            g.update_all(message_func=fn.copy_u('h', 'm'), reduce_func=fn.mean('m', 'h_N'))
            h_N = g.ndata['h_N']
            h_total = torch.cat([inputs, h_N], dim=1)
            return self.linear(h_total)
# Below is the pytorch class (machine learning architectures are initiliazed as classes)
# that defines the the graph convolutional network (GCN) architecture (number of hidden layers, neurons, activation function, etc)
class gcn(torch.nn.Module):
    """Stack of graph-convolution layers: input -> hidden -> hidden -> output.

    The same hidden layer object is applied twice in ``forward``, so both
    hidden steps share one set of weights.
    """

    def __init__(self, input, hidden, output):
        """
        :param input: number of features per node fed to the first layer.
        :param hidden: width of the hidden representation.
        :param output: number of features per node produced by the last layer.
        """
        super(gcn, self).__init__()
        self.gcnInput = gcnLayer(input, hidden)
        self.gcnHidden = gcnLayer(hidden, hidden)
        self.gcnOutput = gcnLayer(hidden, output)

    def forward(self, g, pois):
        """Return the un-activated output of the last layer for graph ``g``."""
        hidden_state = F.relu(self.gcnInput(g, pois))
        # Two passes through the shared hidden layer, each followed by ReLU.
        for _ in range(2):
            hidden_state = F.relu(self.gcnHidden(g, hidden_state))
        return self.gcnOutput(g, hidden_state)
# Below is the pytorch class that defines the the multilayer perceptron (MLP) architecture
# (number of hidden layers, neurons, activation function, etc)
class mlp(torch.nn.Module):
    """Three-layer perceptron with ReLU activations and a single output."""

    def __init__(self, input, hidden):
        """
        :param input: number of input features per sample.
        :param hidden: width of both hidden layers.
        """
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),  # one output value per sample (trip amount)
        )

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        return self.classifier(x)
# Below is the pytorch class that defines the the the combined deep learning architecture
class od(nn.Module):
    """Combined model: a GCN over the nodes followed by an MLP over zone pairs."""

    def __init__(self, gcnInput, gcnHidden, gcnOutput, mlpHidden):
        """
        :param gcnInput: feature width fed to the GCN.
        :param gcnHidden: hidden width of the GCN.
        :param gcnOutput: feature width produced by the GCN.
        :param mlpHidden: hidden width of the MLP.
        """
        super(od, self).__init__()
        self.gcn = gcn(gcnInput, gcnHidden, gcnOutput)
        # MLP input = origin embedding + destination embedding + one cost
        # column.  Use the constructor argument rather than a module-level
        # global so the class works with any output width.
        self.mlp = mlp((2 * gcnOutput + 1), mlpHidden)

    def forward(self, g, pois, costs, indices, q, zoneCount):
        """Predict one value per selected origin/destination pair.

        :param g: graph passed to the GCN.
        :param pois: node feature matrix passed to the GCN.
        :param costs: cost column with one row per zone pair.
        :param indices: rows (zone pairs) to keep, in the desired order.
        :param q: width of the GCN output copied into each half of a pair row.
        :param zoneCount: number of zones; pairs are enumerated as
            ``origin * zoneCount + destination``.
        :return: MLP output for the selected pairs.
        """
        y = self.gcn(g, pois)
        # Allocate on the device the GCN output lives on instead of assuming CUDA.
        p = torch.zeros(len(costs), 2 * q, device=y.device)
        count = 0
        for i in range(zoneCount):
            for j in range(zoneCount):
                p[count, :q] = y[i]  # embedding of the origin zone
                p[count, q:] = y[j]  # embedding of the destination zone
                count += 1
        p = p[indices][:]
        costs = costs[indices][:].to(p.device)
        inputs = torch.cat((p, costs), 1)
        return self.mlp(inputs)
def train(optimizer, model, criterion, pois, costs, labels, indices, zoneCount, gcnOutput):
    """Run one optimisation step and return the loss as a Python number.

    The graph is taken from the module-level name ``g``.
    """
    model.train()            # switch the model to training mode
    optimizer.zero_grad()    # clear gradients left over from the previous step
    prediction = model(g, pois, costs, indices, gcnOutput, zoneCount)
    step_loss = criterion(prediction, labels)
    step_loss.backward()     # back-propagate to populate parameter gradients
    optimizer.step()         # let the optimizer update the parameters
    return step_loss.item()
def test(model, pois, costs, labels, indices, zoneCount, gcnOutput):
    """Evaluate the model on the given pairs and return ``r2_score``.

    The graph is taken from the module-level name ``g``.
    """
    model.eval()  # switch the model to evaluation mode
    # No gradients are recorded inside this context.
    with torch.no_grad():
        raw_output = model(g, pois, costs, indices, gcnOutput, zoneCount)
        predictions = raw_output.detach().cpu()
        return r2_score(labels.cpu(), predictions)
def data_collection(key):
    """Read nodes, POI features, costs, labels and edges from CSV files.

    If the file layout differs from the one used here, this function has to
    be adjusted.

    :param key: data-set name; "mb" (Manhattan and Brooklyn) selects a
        different first feature column than the other keys.
    :return: list ``[g, pois, labels, costs, zoneCount, poiCount]``.
    :raises ValueError: if a row of the nodes file does not contain exactly
        ``poiCount`` feature columns.
    """
    # Index of the first feature column in a nodes.csv row.
    no = 3 if key == "mb" else 2

    # NOTE(review): none of the path templates below contains a "{}"
    # placeholder, so .format(key) leaves them unchanged -- confirm the
    # intended per-key directory layout.
    with open("/nodes.csv".format(key)) as f:
        nodeCount = sum(1 for line in f)
    print (nodeCount)
    with open("/poisInfo.csv".format(key)) as f:
        poiCount = sum(1 for line in f)
    with open("/zones.csv".format(key)) as f:
        zoneCount = sum(1 for line in f)

    pois = torch.zeros((nodeCount, poiCount)).cuda()
    with open('/nodes.csv'.format(key), mode='r') as rx:
        r = csv.reader(rx, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for i, row in enumerate(r):
            features = [int(value) for value in row[no:]]
            # poiCount is the number of LINES in poisInfo.csv; a header line
            # there, or a wrong first feature column, makes the widths differ
            # by one.  Fail with a readable message instead of a tensor-size
            # error from the assignment below.
            if len(features) != poiCount:
                raise ValueError(
                    "nodes.csv row %d has %d feature columns, but poisInfo.csv "
                    "has %d lines" % (i, len(features), poiCount)
                )
            pois[i][:] = torch.FloatTensor(features)

    costs = torch.zeros((zoneCount * zoneCount, 1)).cuda()
    labels = torch.zeros((zoneCount * zoneCount, 1)).cuda()
    with open('/costsTrips.csv'.format(key), mode='r') as rx:
        r = csv.reader(rx, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for count, row in enumerate(r):
            costs[count][0] = int(row[2])
            labels[count][0] = int(row[3])

    # Build the graph the POI features are convolved over.
    g = dgl.DGLGraph().to(torch.device('cuda:0'))
    print (nodeCount)
    g.add_nodes(nodeCount)
    print (nodeCount)
    print (g.number_of_nodes())  # call the method; printing it bare shows only its repr
    with open('/edges.csv'.format(key), mode='r') as rx:
        r = csv.reader(rx, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for row in r:
            g.add_edge(int(row[0]), int(row[1]))
    print('We have %d nodes.' % g.number_of_nodes())
    print('We have %d edges.' % g.number_of_edges())
    return([g, pois, labels, costs, zoneCount, poiCount])
# Experiment driver: for every data set, repeat a random zone split ten times
# and, for split ratios 10%..90%, train a fresh model and record its R2.
gcnoutput = 10
keys = ["manhattan", "brooklyn", "mb"]
count = 0
with open("costFinal.csv", mode='w', newline="") as wx:
    w = csv.writer(wx, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    w.writerow(["place", "iteration", "split", "r2"])
    for key in keys:
        # ``g`` is bound at module level here; train() and test() read it from there.
        g, pois, labels, costs, zoneCount, poiCount = data_collection(key)
        for iteration in range(1, 11):
            a = np.random.permutation(zoneCount)  # random order of the zones
            for i in range(1, 10):
                split = i / 10
                breaker = int(split * zoneCount)
                train_zones = a[:breaker]
                test_zones = a[breaker:]
                # A zone z owns the pair rows z*zoneCount .. z*zoneCount + zoneCount - 1.
                train_indices = []
                test_indices = []
                for z in train_zones:
                    train_indices.extend(range(z * zoneCount, z * zoneCount + zoneCount))
                for z in test_zones:
                    test_indices.extend(range(z * zoneCount, z * zoneCount + zoneCount))
                # model parameters: gcninput, gcnhidden, gcnoutput, mlphidden
                model = od(poiCount, 64, gcnoutput, 64).cuda()
                optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
                criterion = torch.nn.MSELoss()
                for epoch in range(1, 11):  # ten training epochs per split
                    print (epoch)
                    loss = train(optimizer, model, criterion, pois, costs, labels[train_indices], train_indices, zoneCount, gcnoutput)
                    count += 1
                # After training, score the model on the held-out zones.
                r2 = test(model, pois, costs, labels[test_indices], test_indices, zoneCount, gcnoutput)
                w.writerow([key, iteration, i*10, r2])

I want to use NumPy to simulate the inference process of a quantized MobileNet V2 network, but the result differs from the one produced by PyTorch

Python version: 3.8
Pytorch version: 1.9.0+cpu
Platform: Anaconda Spyder5.0
To reproduce this problem, just copy every code below to a single file.
The ILSVRC2012_val_00000293.jpg file used in this code is shown below, you also need to download it and then change its destination in the code.
Some background of this problem:
I am working on a project that aims to develop a hardware accelerator for the inference process of the MobileNet V2 network. I used a pretrained quantized PyTorch model to simulate the outcome, and the results are very good.
To carry out this task in hardware, I need to know every input and output, as well as every intermediate variable, produced while this piece of PyTorch code runs. I used a package named torchextractor to fetch the output of the first layer, which in this case is a 3*3 convolution layer.
import numpy as np
import torchvision
import torch
from torchvision import transforms, datasets
from PIL import Image
from torchvision import transforms
import torchextractor as tx
import math
##### Processing of input image
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
test_transform = transforms.Compose([
preprocess = transforms.Compose([
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
#image file destination
filename = "D:\Project_UM\MobileNet_VC709\MobileNet_pytorch\ILSVRC2012_val_00000293.jpg"
input_image =
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)
#----First verify that the torchextractor class should not influent the inference outcome
# ofmp of layer1 before putting into torchextractor
a,b,c = quantize_tensor(input_batch)# to quantize the input tensor and return an int8 tensor, scale and zero point
input_qa = torch.quantize_per_tensor(torch.tensor(input_batch.clone().detach()), b, c, torch.quint8)# Using quantize_per_tensor method of torch
# Load a quantized mobilenet_v2 model
model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True)
with torch.no_grad():
output = model_quantized.features[0][0](input_qa)# Ofmp of layer1, datatype : quantized_tensor
# print("FM of layer1 before tx_extractor:\n",output.int_repr())# Ofmp of layer1, datatype : int8 tensor
output1_clone = output.int_repr().detach().numpy()# Clone ofmp of layer1, datatype : ndarray
# ofmp of layer1 after adding torchextractor
model_quantized_ex = tx.Extractor(model_quantized, ["features.0.0"])#Capture of the module inside first layer
model_output, features = model_quantized_ex(input_batch)# Forward propagation
# feature_shapes = {name: f.shape for name, f in features.items()}
# print(features['features.0.0']) # Ofmp of layer1, datatype : quantized_tensor
out1_clone = features['features.0.0'].int_repr().numpy() # Clone ofmp of layer1, datatype : ndarray
if(out1_clone.all() == output1_clone.all()):
print('Model with torchextractor attached output the same value as the original model')
print('Torchextractor method influence the outcome')
Here I define a numpy quantization scheme based on the quantization scheme proposed by
Quantization and Training of Neural Networks for Efficient
Integer-Arithmetic-Only Inference
# Convert a normal regular tensor to a quantized tensor with scale and zero_point
def quantize_tensor(x, num_bits=8):
    """Affine-quantise a float tensor to unsigned integers.

    :param x: float tensor to quantise (not modified).
    :param num_bits: bit width of the quantised values; the range is
        ``[0, 2**num_bits - 1]``.
    :return: tuple ``(q_x, scale, zero_point)`` -- the quantised tensor
        (converted with ``.byte()``), the scale as a 0-d tensor and the
        zero point as a Python int.
    :raises ValueError: if all elements of ``x`` are equal, because the
        scale would then be zero.
    """
    qmin = 0.
    qmax = 2.**num_bits - 1.
    min_val, max_val = x.min(), x.max()
    if max_val == min_val:
        raise ValueError("cannot quantize a constant tensor: the value range is zero")
    scale = (max_val - min_val) / (qmax - qmin)
    initial_zero_point = qmin - min_val / scale
    # Keep the zero point inside the representable range.  Without the final
    # ``else`` the clamped value was always overwritten by the raw one.
    if initial_zero_point < qmin:
        zero_point = qmin
    elif initial_zero_point > qmax:
        zero_point = qmax
    else:
        zero_point = initial_zero_point
    # NOTE(review): int() truncates toward zero rather than rounding to the
    # nearest integer -- confirm this matches the reference quantiser.
    zero_point = int(zero_point)
    q_x = (zero_point + x / scale).clamp(qmin, qmax).round().byte()
    return q_x, scale, zero_point
# #############################################################################################
# --------- Simulate the inference process of layer0: conv33 using numpy
# #############################################################################################
# get the input_batch quantized buffer data
input_scale = b.item()
input_zero = c
input_quantized = a[0].detach().numpy()
# get the layer0 output scale and zero_point
output_scale = model_quantized.features[0][0].state_dict()['scale'].item()
output_zero = model_quantized.features[0][0].state_dict()['zero_point'].item()
# get the quantized weight with scale and zero_point
weight_scale = model_quantized.features[0][0].state_dict()["weight"].q_scale()
weight_zero = model_quantized.features[0][0].state_dict()["weight"].q_zero_point()
weight_quantized = model_quantized.features[0][0].state_dict()["weight"].int_repr().numpy()
# print(weight_quantized)
# print(weight_quantized.shape)
# bias_quantized,bias_scale,bias_zero= quantize_tensor(model_quantized.features[0][0].state_dict()["bias"])# to quantize the input tensor and return an int8 tensor, scale and zero point
# print(bias_quantized.shape)
bias = model_quantized.features[0][0].state_dict()["bias"].detach().numpy()
# print(input_quantized)
Then I write a quantized 2D convolution using numpy, hope to figure out every details in pytorch data flow during the inference.
#%% numpy simulated layer0 convolution function define
def conv_cal(input_quantized, weight_quantized, kernel_size, stride, out_i, out_j, out_k):
    """Compute one output element of a 2-D convolution.

    :param input_quantized: input feature map, indexed (channel, row, column).
    :param weight_quantized: filters, indexed (filter, channel, row, column).
    :param kernel_size: side length of the scratch window that is multiplied
        with the selected filter.
    :param stride: step between neighbouring output positions.
    :param out_i: index of the filter (output channel).
    :param out_j: output row.
    :param out_k: output column.
    :return: sum over the element-wise product of the filter and the input
        window whose top-left corner is ``(stride * out_j, stride * out_k)``.
    :raises IndexError: if the window does not fit inside the input or the
        scratch buffer.
    """
    weight = weight_quantized[out_i]
    depth, rows, cols = weight.shape[0], weight.shape[1], weight.shape[2]
    top, left = stride * out_j, stride * out_k
    # A slice would silently shrink at the border, so check the bounds
    # explicitly to keep out-of-range accesses an error.
    if (depth > input_quantized.shape[0] or rows > kernel_size or cols > kernel_size
            or top + rows > input_quantized.shape[1]
            or left + cols > input_quantized.shape[2]):
        raise IndexError("convolution window falls outside the input feature map")
    # Float scratch buffer, zero wherever the filter does not reach.
    window = np.zeros((input_quantized.shape[0], kernel_size, kernel_size))
    window[:depth, :rows, :cols] = input_quantized[:depth, top:top + rows, left:left + cols]
    return np.multiply(weight, window).sum()
def QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, kernel_size, stride, padding, ofm_size):
    """Simulate a quantised 2-D convolution and return the rounded output map.

    Zero points are subtracted from the input and the weights, the input is
    zero-padded by ``padding`` on each spatial side, and every output element
    is ``round(weight_scale * input_scale / output_scale * acc
    + bias[i] / output_scale + output_zero)`` where ``acc`` comes from
    ``conv_cal``.  The result has shape
    ``(weight_quantized.shape[0], ofm_size, ofm_size)``; values are not
    clamped here.
    """
    out_channels = weight_quantized.shape[0]
    output = np.zeros((out_channels, ofm_size, ofm_size))

    in_channels = input_quantized.shape[0]
    in_rows = input_quantized.shape[1]
    in_cols = input_quantized.shape[2]
    padded_input = np.full((in_channels, in_rows + 2 * padding, in_cols + 2 * padding), 0)

    # Remove the zero points via full-size arrays, as the element-wise
    # version did, so the subtraction is carried out with the same dtypes.
    centred_input = input_quantized - np.full(input_quantized.shape, input_zero)
    padded_input[:, padding:padding + in_rows, padding:padding + in_cols] = centred_input
    centred_weight = weight_quantized - np.full(weight_quantized.shape, weight_zero)

    for i in range(output.shape[0]):
        for j in range(output.shape[1]):
            for k in range(output.shape[2]):
                acc = conv_cal(padded_input, centred_weight, kernel_size, stride, i, j, k)
                output[i][j][k] = weight_scale*input_scale/output_scale*acc + bias[i]/output_scale + output_zero
                output[i][j][k] = round(output[i][j][k])
    return output
Here I input the same image, weight, and bias together with their zero_point and scale, then compare this "numpy simulated" result to the PyTorch calculated one.
quantized_model_out1_int8 = np.squeeze(features['features.0.0'].int_repr().numpy())
out1_np = QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, 3, 2, 1, 112)"out1_np.npy",out1_np)
for i in range(quantized_model_out1_int8.shape[0]):
for j in range(quantized_model_out1_int8.shape[1]):
for k in range(quantized_model_out1_int8.shape[2]):
if(out1_np[i][j][k] < 0):
out1_np[i][j][k] = 0
flag = np.zeros(quantized_model_out1_int8.shape)
for i in range(quantized_model_out1_int8.shape[0]):
for j in range(quantized_model_out1_int8.shape[1]):
for k in range(quantized_model_out1_int8.shape[2]):
if(quantized_model_out1_int8[i][j][k] == out1_np[i][j][k]):
flag[i][j][k] = 1
out1_np[i][j][k] = 0
quantized_model_out1_int8[i][j][k] = 0
# Compare the simulated result to extractor fetched result, gain the total hit rate
If a "NumPy simulated" value is the same as the extracted one, I call it a hit. Printing the total hit rate shows that NumPy gets 92% of the values right. The problem is that I have no idea why the remaining 8% of the values come out wrong.
Comparison of two outcomes:
The picture below shows the values that differ between the NumPy result and the PyTorch result; the sample channel is index [1]. The upper left corner is the NumPy result and the upper right corner is the PyTorch result. I have set all values that are the same in both to 0. As you can see, most of the values differ by only 1 (this can be viewed as the error caused by the precision loss of fixed-point arithmetic), but some differ by a lot, e.g. the value [1][4]: 121 vs. 76 (I don't know why).
Focus on one strange value:
This code replays the calculation of the value [1][4]. Originally I expected that trial and error would lead me to the number I wanted, 76, but no matter what I tried, it never output 76. If you want to try this yourself, I have pasted the code here for your convenience.
#%% A test code to check the calculation process
weight_quantized_sample = weight_quantized[2]
M_t = input_scale * weight_scale / output_scale
ifmap_t = np.int32(input_quantized[:,1:4,7:10])
weight_t = np.int32(weight_quantized_sample)
bias_t = bias[2]
bias_q = bias_t/output_scale
res_t = 0
for ch in range(3):
ifmap_offset = ifmap_t[ch]-np.int32(input_zero)
weight_offset = weight_t[ch]-np.int32(weight_zero)
res_ch = np.multiply(ifmap_offset, weight_offset)
res_ch = res_ch.sum()
res_t = res_t + res_ch
res_mul = M_t*res_t
# for n in range(1, 30):
# res_mul = multiply(n, M_t, res_t)
res_t = round(res_mul + output_zero + bias_q)
Could you help me out of this, have been stuck here for a long time.
I implemented my own version of quantized convolution and got a hit rate between 99.999% and 100% (the single mismatching value is off by 1, which I consider a rounding issue). The link to the paper in the question helped a lot.
However, I found that your formulas are the same as mine, so I don't know what your issue was. As I understand it, quantization in PyTorch is hardware dependent.
Here is my code:
def my_Conv2dRelu_b2(input_q, conv_layer, output_shape):
    """Re-compute a quantised convolution (with zero clamp) in NumPy.

    :param input_q: quantized input tensor; its first dimension is iterated
        as the batch.
    :param conv_layer: quantized convolution module; its ``weight()``,
        ``bias()``, ``padding``, ``stride``, ``scale`` and ``zero_point``
        are read.
    :param output_shape: the pre-computed shape of the result.
    :return: float array of shape ``output_shape`` holding the integer
        output values.
    """
    output = np.zeros(output_shape)
    # extract needed float numbers from quantized operations
    weights_scale = conv_layer.weight().q_per_channel_scales()
    input_scale = input_q.q_scale()
    weights_zp = conv_layer.weight().q_per_channel_zero_points()
    input_zp = input_q.q_zero_point()
    # extract needed convolution parameters
    padding = conv_layer.padding
    stride = conv_layer.stride
    # extract float numbers for results
    output_zp = conv_layer.zero_point
    output_scale = conv_layer.scale
    conv_weights_int = conv_layer.weight().int_repr()
    input_int = input_q.int_repr()
    biases = conv_layer.bias().detach().numpy()
    for k in range(input_q.shape[0]):
        for i in range(conv_weights_int.shape[0]):
            # One image of the batch against one output-channel filter; the
            # weight scale and zero point are taken per output channel.
            output[k][i] = manual_convolution_quant(
                input_int[k].numpy(),
                conv_weights_int[i].numpy(),
                biases[i],
                padding=padding,
                stride=stride,
                image_zp=input_zp, image_scale=input_scale,
                kernel_zp=weights_zp[i].item(), kernel_scale=weights_scale[i].item(),
                result_zp=output_zp, result_scale=output_scale,
            )
    return output
def manual_convolution_quant(image, kernel, b, padding, stride, image_zp, image_scale, kernel_zp, kernel_scale,
                             result_zp, result_scale):
    """Quantised convolution of one image with one filter, clamped below at 0.

    :param image: integer input, indexed (channel, row, column).
    :param kernel: integer filter, indexed (channel, row, column).
    :param b: float bias of this output channel.
    :param padding: ``(rows, columns)`` padding added on each side.
    :param stride: ``(rows, columns)`` step between output positions.
    :param image_zp: zero point of the input; also used as the padding value
        so that padded cells contribute nothing.
    :param image_scale: scale of the input.
    :param kernel_zp: zero point of the filter.
    :param kernel_scale: scale of the filter.
    :param result_zp: zero point of the output.
    :param result_scale: scale of the output.
    :return: 2-D float array of rounded output values.
    """
    # Work in a wide signed type: subtracting a zero point from uint8/int8
    # data could otherwise wrap around.
    image = np.asarray(image).astype(np.int64)
    kernel = np.asarray(kernel).astype(np.int64)
    k_h, k_w = kernel.shape[1], kernel.shape[2]
    H = image.shape[1]
    W = image.shape[2]
    # Standard convolution output size; plain H // stride is only right for
    # some combinations of size, padding and kernel.
    new_H = max((H + 2 * padding[0] - k_h) // stride[0] + 1, 0)
    new_W = max((W + 2 * padding[1] - k_w) // stride[1] + 1, 0)
    results = np.zeros([new_H, new_W])
    M = image_scale * kernel_scale / result_scale
    bias = b / result_scale
    paddedIm = np.pad(
        image,
        [(0, 0), (padding[0], padding[0]), (padding[1], padding[1])],
        mode="constant",
        constant_values=image_zp,
    )
    centred_kernel = kernel - kernel_zp
    for i in range(new_H):
        for j in range(new_W):
            patch = paddedIm[
                :, i * stride[0]: i * stride[0] + k_h, j * stride[1]: j * stride[1] + k_w
            ]
            res = M * (centred_kernel * (patch - image_zp)).sum() + result_zp + bias
            # NOTE(review): only the lower bound is clamped; values above the
            # integer range of the output type are not saturated here.
            if res < 0:
                res = 0
            results[i, j] = round(res)
    return results
Code to compare pytorch and my own version.
def calc_hit_rate(array1, array2):
    """Return the fraction of positions at which the two arrays are equal.

    :param array1: first array; its ``size`` is the denominator.
    :param array2: second array, compared element-wise with ``array1``.
    :return: number of matching elements divided by ``array1.size``.
    """
    good = (array1 == array2).astype(int).sum()
    total = array1.size
    return good / total
# during inference
y2 = model.conv1(y1)
y2_int = torch.int_repr(y2)
y2_int_manual = my_Conv2dRelu_b2(y1, model.conv1, y2.shape)
print(f'y2 hit rate= {calc_hit_rate(y2.int_repr().numpy(), y2_int_manual)}') #hit_rate=1.0

gaussian process regression in multiple dimensions with GPflow

I would like to perform some multivariant regression using gaussian process regression as implemented in GPflow using version 2.
Installed with pip install gpflow==2.0.0rc1
Below is some example code that generates some 2D data and then attempts to fit it with using GPR and the finally computes the difference
between the true input data and the GPR prediction.
Eventually I would like to extend to higher dimensions
and do tests against a validation set to check for over-fitting
and experiment with other kernels and "Automatic Relevance Determination"
but understanding how to get this to work is the first step.
The following code snippet will work in a jupyter notebook.
import gpflow
import numpy as np
import matplotlib
from gpflow.utilities import print_summary
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12, 6)
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def gen_data(X, Y):
    """
    make some fake data.
    X, Y are np.ndarrays with shape (N,) where
    N is the number of samples.

    Returns an np.ndarray of shape (N,) with ``Y * sin(10 * X) + 1``
    evaluated pair by pair.
    """
    # NOTE(review): an earlier ``y = x0 * np.sin(x0*10)`` assignment was
    # immediately overwritten and never contributed to the result -- confirm
    # it was not meant to be added.
    ys = [x1 * np.sin(x0 * 10) + 1 for x0, x1 in zip(X, Y)]
    return np.array(ys)
# generate some fake data
x = np.linspace(0, 1, 20)
X, Y = np.meshgrid(x, x)
X = X.ravel()
Y = Y.ravel()
z = gen_data(X, Y)
#note X.shape, Y.shape and z.shape
#are all (400,) for this case.
# if you would like to plot the data you can do the following
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(X, Y, z, s=100, c='k')
# had to set this
# to avoid the following error
# tensorflow.python.framework.errors_impl.InvalidArgumentError: Cholesky decomposition was not successful. The input might not be valid. [Op:Cholesky]
# setup the kernel
k = gpflow.kernels.Matern52()
# set up GPR model
# I think the shape of the independent data
# should be (400, 2) for this case
XY = np.column_stack([[X, Y]]).T
print(XY.shape) # this will be (400, 2)
m = gpflow.models.GPR(data=(XY, z), kernel=k, mean_function=None)
# optimise hyper-parameters
opt = gpflow.optimizers.Scipy()
def objective_closure():
return - m.log_marginal_likelihood()
opt_logs = opt.minimize(objective_closure,
# predict training set
mean, var = m.predict_f(XY)
# (400, 400)
# I would expect this to be (400,)
# If it was then I could compute the difference
# between the true data and the GPR prediction
# `diff = mean - z`
# but because the shape is not as expected this of course
# won't work.
The shape of z must be (N, 1), whereas in your case it is (N,). However, this is a missing check in GPflow and not your fault.

Subprocessing Data Loading in pytroch into Google Colab

I'm training a deep neural network with PyTorch, and I use a DataLoader to preprocess the data and to load the dataset with multiple worker processes. I set the num_workers attribute to a positive number such as 4, and my batch_size is 8. I train the network in the Google Colab environment, but after a few minutes training stops with an error while reading .PNG files. I think it is a memory error, and I would like to know how the number of GPUs, batch_size and num_workers relate to each other so that I can choose reasonable values for them, especially in Google Colab.
I think you can follow this page:
It provide a guide of how to set settings of Google Colab.
I try it and feels really fast.
Hope you love it.
Following is the code it provides but I change a bit about install pytorch:
#!/usr/bin/env python
# encoding: utf-8
import sys
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'
!pip install -q{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
input_size = 784 # The image size = 28 x 28 = 784
hidden_size = 500 # The number of nodes at the hidden layer
num_classes = 10 # The number of output classes. In this case, from 0 to 9
num_epochs = 5 # The number of times entire dataset is trained
batch_size = 100 # The size of input data took for one iteration
learning_rate = 1e-3 # The speed of convergence
train_dataset = dsets.MNIST(root='./data',
test_dataset = dsets.MNIST(root='./data',
train_loader =,
test_loader =,
class Net(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, num_classes):
        """
        :param input_size: number of input features per sample.
        :param hidden_size: number of units in the hidden layer.
        :param num_classes: number of output scores per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)   # input -> hidden
        self.relu = nn.ReLU()                           # max(0, x)
        self.fc2 = nn.Linear(hidden_size, num_classes)  # hidden -> class scores

    def forward(self, x):
        """Return the un-normalised class scores for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
net = Net(input_size, hidden_size, num_classes)
use_cuda = True
if use_cuda and torch.cuda.is_available():
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for i, (images, labels) in enumerate(train_loader): # Load a batch of images with its (index, data, class)
images = Variable(images.view(-1, 28*28)) # Convert torch tensor to Variable: change image from a vector of size 784 to a matrix of 28 x 28
labels = Variable(labels)
if use_cuda and torch.cuda.is_available():
images = images.cuda()
labels = labels.cuda()
optimizer.zero_grad() # Intialize the hidden weight to all zeros
outputs = net(images) # Forward pass: compute the output class given a image
loss = criterion(outputs, labels) # Compute the loss: difference between the output class and the pre-given label
loss.backward() # Backward pass: compute the weight
optimizer.step() # Optimizer: update the weights of hidden nodes
if (i+1) % 100 == 0: # Logging
print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
%(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size,[0]))
correct = 0
total = 0
for images, labels in test_loader:
images = Variable(images.view(-1, 28*28))
if use_cuda and torch.cuda.is_available():
images = images.cuda()
labels = labels.cuda()
outputs = net(images)
_, predicted = torch.max(, 1) # Choose the best class from the output: The class with the best score
total += labels.size(0) # Increment the total count
correct += (predicted == labels).sum() # Increment the correct count
print('Accuracy of the network on the 10K test images: %d %%' % (100 * correct / total)), 'fnn_model.pkl')

How to use keras ImageDataGenerator with a Siamese or Tripple networks

I'm trying to build both a Siamese neural network and a triplet neural network on a large custom dataset.
Keras has ImageDataGenerator, which makes generating input data for a regular neural network very easy.
I'm interested in using ImageDataGenerator, or something similar, to train networks with 2 (Siamese) and 3 (triplet) inputs.
In the Keras MNIST Siamese example, the input is generated in a pre-processing stage by the create_pairs method. I don't think this approach is suitable for a large dataset.
Is it possible to use ImageDataGenerator in this case? What are my other options assuming the data-set is very big?
The idea of a data generator is to give fit_generator a stream of data in batches, which gives you control over how the data is produced, i.e. whether you load it from files or apply some data augmentation, as ImageDataGenerator does.
Here I am posting a modified version of the MNIST Siamese example with a custom DataGenerator; you can work it out from here.
import numpy as np
np.random.seed(1337) # for reproducibility
import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda
from keras.optimizers import SGD, RMSprop
from keras import backend as K
class DataGenerator(object):
    """Serve batches of MNIST image pairs for training a Siamese network.

    The constructor loads MNIST, flattens and rescales the images, and builds
    alternating positive/negative pairs for the train and test splits.
    ``next_train`` and ``next_val`` are endless generators meant to be handed
    to ``fit_generator``.
    """

    def __init__(self, batch_sz):
        """Load the data, build the pairs and reset the batch cursors.

        Args:
            batch_sz: number of pairs served per batch.
        """
        # the data, shuffled and split between train and test sets
        (X_train, y_train), (X_test, y_test) = mnist.load_data()
        X_train = X_train.reshape(60000, 784)
        X_test = X_test.reshape(10000, 784)
        X_train = X_train.astype('float32')
        X_test = X_test.astype('float32')
        X_train /= 255
        X_test /= 255

        # create training+test positive and negative pairs
        digit_indices = [np.where(y_train == i)[0] for i in range(10)]
        self.tr_pairs, self.tr_y = self.create_pairs(X_train, digit_indices)

        digit_indices = [np.where(y_test == i)[0] for i in range(10)]
        self.te_pairs, self.te_y = self.create_pairs(X_test, digit_indices)

        # Split each pair into its two members, one array per network input.
        self.tr_pairs_0 = self.tr_pairs[:, 0]
        self.tr_pairs_1 = self.tr_pairs[:, 1]
        self.te_pairs_0 = self.te_pairs[:, 0]
        self.te_pairs_1 = self.te_pairs[:, 1]

        self.batch_sz = batch_sz
        # Floor division keeps these integers that are whole multiples of the
        # batch size; plain `/` yields a float under Python 3.
        self.samples_per_train = (self.tr_pairs.shape[0] // self.batch_sz) * self.batch_sz
        self.samples_per_val = (self.te_pairs.shape[0] // self.batch_sz) * self.batch_sz

        # The generators read these cursors before ever assigning them, so
        # they must exist up front.
        self.cur_train_index = 0
        self.cur_val_index = 0

    def create_pairs(self, x, digit_indices):
        """Positive and negative pair creation.

        Alternates between positive and negative pairs.

        Args:
            x: indexable collection of samples.
            digit_indices: one sequence of sample indices per class.

        Returns:
            ``(pairs, labels)`` as NumPy arrays; label 1 marks a same-class
            pair and label 0 a different-class pair.
        """
        pairs = []
        labels = []
        num_classes = len(digit_indices)
        # Every class contributes the same number of pairs, bounded by the
        # smallest class (minus one because pairs use items i and i + 1).
        n = min(len(digit_indices[d]) for d in range(num_classes)) - 1
        for d in range(num_classes):
            for i in range(n):
                # Positive pair: two consecutive samples of class d.
                z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
                pairs.append([x[z1], x[z2]])
                # Negative pair: a non-zero random offset guarantees a
                # class different from d.
                inc = random.randrange(1, num_classes)
                dn = (d + inc) % num_classes
                z1, z2 = digit_indices[d][i], digit_indices[dn][i]
                pairs.append([x[z1], x[z2]])
                labels += [1, 0]
        return np.array(pairs), np.array(labels)

    def next_train(self):
        """Yield ``([left, right], labels)`` training batches forever.

        NOTE(review): the source was truncated after the first slice of the
        yield expression and had lost the wrap-around reset; the second slice,
        the label slice and the reset are reconstructed -- confirm.
        """
        while 1:
            self.cur_train_index += self.batch_sz
            if self.cur_train_index >= self.samples_per_train:
                self.cur_train_index = 0
            start = self.cur_train_index
            stop = start + self.batch_sz
            yield ([self.tr_pairs_0[start:stop],
                    self.tr_pairs_1[start:stop]],
                   self.tr_y[start:stop])

    def next_val(self):
        """Yield ``([left, right], labels)`` validation batches forever.

        NOTE(review): reconstructed the same way as ``next_train`` -- confirm.
        """
        while 1:
            self.cur_val_index += self.batch_sz
            if self.cur_val_index >= self.samples_per_val:
                self.cur_val_index = 0
            start = self.cur_val_index
            stop = start + self.batch_sz
            yield ([self.te_pairs_0[start:stop],
                    self.te_pairs_1[start:stop]],
                   self.te_y[start:stop])
def euclidean_distance(vects):
    """Return the row-wise Euclidean distance between two tensors.

    Args:
        vects: a pair ``(x, y)`` of backend tensors.

    Returns:
        A tensor of distances reduced over axis 1 with that axis kept.
    """
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    # Clamp away from zero: the gradient of sqrt is unbounded at 0, which
    # turns the loss into NaN as soon as two embeddings coincide.
    return K.sqrt(K.maximum(sum_square, K.epsilon()))
def eucl_dist_output_shape(shapes):
    """Return the output shape of the distance layer.

    Args:
        shapes: a pair of input shapes; only the first dimension of the
            first shape is used.

    Returns:
        ``(shape1[0], 1)`` -- one distance value per row.
    """
    shape1, shape2 = shapes
    return (shape1[0], 1)
def contrastive_loss(y_true, y_pred):
    """Contrastive loss from Hadsell-et-al.'06.

    Computes the mean of
    ``y_true * y_pred**2 + (1 - y_true) * max(margin - y_pred, 0)**2``.

    Args:
        y_true: pair labels; the first term is weighted by ``y_true`` and the
            second by ``1 - y_true``.
        y_pred: predicted distances.

    Returns:
        The mean loss as a backend tensor.
    """
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
def create_base_network(input_dim):
    """Base network to be shared (eq. to feature extraction).

    Args:
        input_dim: length of each flattened input vector.

    Returns:
        A ``Sequential`` model made of three 128-unit ReLU ``Dense`` layers.
    """
    seq = Sequential()
    seq.add(Dense(128, input_shape=(input_dim,), activation='relu'))
    seq.add(Dense(128, activation='relu'))
    seq.add(Dense(128, activation='relu'))
    return seq
def compute_accuracy(predictions, labels):
    """Compute classification accuracy with a fixed threshold on distances.

    A pair is predicted positive when its distance is below 0.5. The result
    is the fraction of pairs whose prediction matches its label. The previous
    formula, ``labels[predictions.ravel() < 0.5].mean()``, returned the mean
    label of the predicted-positive pairs only, so misclassified positive
    pairs were never counted.

    Args:
        predictions: array of predicted distances, any shape.
        labels: array of 0/1 pair labels with one entry per prediction.

    Returns:
        The fraction of correctly classified pairs.
    """
    predicted_same = np.asarray(predictions).ravel() < 0.5
    return np.mean(predicted_same == np.asarray(labels).ravel())
# Build the Siamese model and train it from the pair generator.
input_dim = 784
nb_epoch = 20
# NOTE(review): `batch_size` was used below without ever being defined in the
# source; 128 is an assumed value -- confirm.
batch_size = 128

datagen = DataGenerator(batch_size)

# network definition
base_network = create_base_network(input_dim)

input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model(input=[input_a, input_b], output=distance)

# train
rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms)
model.fit_generator(
    generator=datagen.next_train(),
    samples_per_epoch=datagen.samples_per_train,
    nb_epoch=nb_epoch,
    validation_data=datagen.next_val(),
    nb_val_samples=datagen.samples_per_val,
)