Is there an function in PyTorch for converting convolutions to fully-connected networks form? - neural-network

I'm trying to convert a convolution layer to a fully-connected layer.
For example, there is an example of 3×3 input and 2x2 kernel:
which is equivalent to a vector-matrix multiplication,
Is there a function in PyTorch to get the matrix B?

I can only partially answer your question:
In your example above, you write the kernel as matrix and the input as a vector. If you are fine with writing the input as a matrix, you can use torch.nn.Unfold which explicitly calculates a convolution in the documentation:
# Convolution is equivalent with Unfold + Matrix Multiplication + Fold (or view to output shape)
inp = torch.randn(1, 3, 10, 12)
w = torch.randn(2, 3, 4, 5)
inp_unf = torch.nn.functional.unfold(inp, (4, 5))
out_unf = inp_unf.transpose(1, 2).matmul(w.view(w.size(0), -1).t()).transpose(1, 2)
out = out_unf.view(1, 2, 7, 8)
(torch.nn.functional.conv2d(inp, w) - out).abs().max()
# tensor(1.9073e-06)
If, however, you need to calculate the matrix for the kernel (the smaller matrix) you can use this function, which is based on Warren Weckessers answer:
def toeplitz_1_ch(kernel, input_size):
# shapes
k_h, k_w = kernel.shape
i_h, i_w = input_size
o_h, o_w = i_h-k_h+1, i_w-k_w+1
# construct 1d conv toeplitz matrices for each row of the kernel
toeplitz = []
for r in range(k_h):
toeplitz.append(linalg.toeplitz(c=(kernel[r,0], *np.zeros(i_w-k_w)), r=(*kernel[r], *np.zeros(i_w-k_w))) )
# construct toeplitz matrix of toeplitz matrices (just for padding=0)
h_blocks, w_blocks = o_h, i_h
h_block, w_block = toeplitz[0].shape
W_conv = np.zeros((h_blocks, h_block, w_blocks, w_block))
for i, B in enumerate(toeplitz):
for j in range(o_h):
W_conv[j, :, i+j, :] = B
W_conv.shape = (h_blocks*h_block, w_blocks*w_block)
return W_conv
which is not in pytorch but in numpy. This is for padding = 0 but can easily be adjusted by changing h_blocks and w_blocks and W_conv[i+j, :, j, :].
Update: Multiple output channels are just multiple of these matrices, as each output has its own kernel. Multiple input channels also have their own kernels - and their own matrices - over which you average after the convolution. This can be implemented as follows:
def conv2d_toeplitz(kernel, input):
"""Compute 2d convolution over multiple channels via toeplitz matrix
Args:
kernel: shape=(n_out, n_in, H_k, W_k)
input: shape=(n_in, H_i, W_i)"""
kernel_size = kernel.shape
input_size = input.shape
output_size = (kernel_size[0], input_size[1] - (kernel_size[1]-1), input_size[2] - (kernel_size[2]-1))
output = np.zeros(output_size)
for i,ks in enumerate(kernel): # loop over output channel
for j,k in enumerate(ks): # loop over input channel
T_k = toeplitz_1_ch(k, input_size[1:])
output[i] += T_k.dot(input[j].flatten()).reshape(output_size[1:]) # sum over input channels
return output
To check the correctness:
k = np.random.randn(4*3*3*3).reshape((4,3,3,3))
i = np.random.randn(3,7,9)
out = conv2d_toeplitz(k, i)
# check correctness of convolution via toeplitz matrix
print(np.sum((out - F.conv2d(torch.tensor(i).view(1,3,7,9), torch.tensor(k)).numpy())**2))
>>> 1.0063523219807736e-28
Update 2:
It is also possible to do this without looping in one matrix:
def toeplitz_mult_ch(kernel, input_size):
"""Compute toeplitz matrix for 2d conv with multiple in and out channels.
Args:
kernel: shape=(n_out, n_in, H_k, W_k)
input_size: (n_in, H_i, W_i)"""
kernel_size = kernel.shape
output_size = (kernel_size[0], input_size[1] - (kernel_size[1]-1), input_size[2] - (kernel_size[2]-1))
T = np.zeros((output_size[0], int(np.prod(output_size[1:])), input_size[0], int(np.prod(input_size[1:]))))
for i,ks in enumerate(kernel): # loop over output channel
for j,k in enumerate(ks): # loop over input channel
T_k = toeplitz_1_ch(k, input_size[1:])
T[i, :, j, :] = T_k
T.shape = (np.prod(output_size), np.prod(input_size))
return T
The input has to be flattened and the output reshaped after multiplication.
Checking for correctness (using the same i and k as above):
T = toeplitz_mult_ch(k, i.shape)
out = T.dot(i.flatten()).reshape((1,4,5,7))
# check correctness of convolution via toeplitz matrix
print(np.sum((out - F.conv2d(torch.tensor(i).view(1,3,7,9), torch.tensor(k)).numpy())**2))
>>> 1.5486060830252635e-28

You can use my code for convolution with circular padding:
import numpy as np
import scipy.linalg as linalg
def toeplitz_1d(k, x_size):
k_size = k.size
r = *k[(k_size // 2):], *np.zeros(x_size - k_size), *k[:(k_size // 2)]
c = *np.flip(k)[(k_size // 2):], *np.zeros(x_size - k_size), *np.flip(k)[:(k_size // 2)]
t = linalg.toeplitz(c=c, r=r)
return t
def toeplitz_2d(k, x_size):
k_h, k_w = k.shape
i_h, i_w = x_size
ks = np.zeros((i_w, i_h * i_w))
for i in range(k_h):
ks[:, i*i_w:(i+1)*i_w] = toeplitz_1d(k[i], i_w)
ks = np.roll(ks, -i_w, 1)
t = np.zeros((i_h * i_w, i_h * i_w))
for i in range(i_h):
t[i*i_h:(i+1)*i_h,:] = ks
ks = np.roll(ks, i_w, 1)
return t
def toeplitz_3d(k, x_size):
k_oc, k_ic, k_h, k_w = k.shape
i_c, i_h, i_w = x_size
t = np.zeros((k_oc * i_h * i_w, i_c * i_h * i_w))
for o in range(k_oc):
for i in range(k_ic):
t[(o * (i_h * i_w)):((o+1) * (i_h * i_w)), (i * (i_h * i_w)):((i+1) * (i_h * i_w))] = toeplitz_2d(k[o, i], (i_h, i_w))
return t
if __name__ == "__main__":
import torch
k = np.random.randint(50, size=(3, 2, 3, 3))
x = np.random.randint(50, size=(2, 5, 5))
t = toeplitz_3d(k, x.shape)
y = t.dot(x.flatten()).reshape(3, 5, 5)
xx = torch.nn.functional.pad(torch.from_numpy(x.reshape(1, 2, 5, 5)), pad=(1, 1, 1, 1), mode='circular')
yy = torch.conv2d(xx, torch.from_numpy(k))
err = ((y - yy.numpy()) ** 2).sum()
print(err)

While the other answers are correct, there is a faster way. In your example, you give an input of size 3x3 with a kernel of size 2x2. And your resulting circulant matrix multiplied by the input image is 9x9x4 operations, or 324 in total. Here is a method that does this with 4 x 4 x 4, or 64 operations in total. We will use Pytorch, but this could be done in Numpy, as well.
Assume an image input of shape (batch, channels, height, width):
import torch
def get_kernel_inputs(image, kernel):
out = torch.empty(image.size()[0], 0, 1, kernel.size()[-2] * kernel.size()[-1])
for k in range(image.size()[-2] - kernel.size()[-2] + 1):
for l in range(image.size()[-1] - kernel.size()[-1] + 1):
out = torch.cat([out,image[:, :, k:k+kernel.size()[-2],l:l + kernel.size()[-1]].reshape(image.size()[0], -1, 1, kernel.size()[-1] * kernel.size()[-2])], dim=1)
return out
Now let's test to see what size out this gives:
img = torch.rand(1, 1, 3, 3)
kernel = torch.rand(2, 2)
kernelized_img = get_kernel_inputs(img, kernel)
print(kernelized_img.size())
This yields a size of:
torch.Size([1, 4, 1, 4])
So there are 16 values stored in the above tensor. Now let's matrix multiply:
print(torch.matmul(kernelized_img, kernel.view(4)))
This is 16 x 4 multiplications.
Finally, let's test that this is, in fact, giving out the correct value by using the Torch Conv2d module:
import torch.nn as nn
mm = nn.Conv2d(1, 1, (2,2), bias=False)
with torch.no_grad():
kernel_test = mm.weight
print("Control ", mm(img))
print("Test", torch.matmul(kernelized_img, kernel_test.view(4)).view(1, 1, 2, 2))
Control tensor([[[[-0.0089, 0.0178],
[-0.1419, 0.2720]]]], grad_fn=<ThnnConv2DBackward>)
Test tensor([[[[-0.0089, 0.0178],
[-0.1419, 0.2720]]]], grad_fn=<ViewBackward>)
All we are doing differently in the above is reshaping the image instead of the kernel.
Setting the image height and width equal and the kernel height and width equal, where
i=image height/width
k=kernel height/width
Then the difference in the number of calculations in the Toeplitz method vs. the above method is:
Edit Addition:
The above implementation only worked on single-channel inputs. For this definition to work on multiple channel inputs and outputs, plus handle batches, can do the following:
def get_kernel_inputs(image, kernel):
out=torch.empty(image.size()[0], image.size()[1], 0, kernel.size()[-2]*kernel.size()[-1])
out_size=[image.size()[-2]-kernel.size()[-2]+1,(image.size()[-1]-kernel.size()[-1]+1)]
for k in range(out_size[0]):
for l in range(out_size[1]):
out=torch.cat([out,image[:,:,k:k+kernel.size()[-2],l:l+kernel.size()[-1]].reshape(image.size()[0],-1,1,kernel.size()[-1]*kernel.size()[-2])],dim=2)
preout=out.permute(0,2,1,3).reshape(image.size()[0],-1,image.size()[1]*kernel.size()[-2]*kernel.size()[-1])
kernel1 = kernel.view(kernel.size()[0], -1)
out = torch.matmul(preout, kernel1.T).permute(0, 2, 1).reshape(image.size()[0], kernel.size()[0],
out_size[0], out_size[1])
return out
images=torch.rand(5, 3, 32, 32)
mm=nn.Conv2d(3, 32, (3, 3), bias=False)
#Set the kernel to Conv2d init for testing
with torch.no_grad():
kernel=mm.weight
print(get_kernel_inputs(images, kernel))
print(mm(images))

Related

L1 regulariser Pytorch acting opposite to what I expect

I'm trying to add an L1 penalty to a specific layer of a neural network, and I have the code below (in which I attempt to add l1 penalty to the first layer). If I run it for lambda = 0 (i.e. no penalty), the output gets very close to the expected weights those being [10, 12, 2, 11, -0.25]) and if I run for enough epochs or reduce batch size it will get it exactly, as in the output below:
mlp.0.weight
Parameter containing:
tensor([[ 9.8657, -11.8305, 2.0242, 10.8913, -0.1978]],
requires_grad=True)
Then, when I run it for a large lambda, say 1000, I would expect these weights to shrink towards zero as there is a large penalty being added to the loss that we are trying to minimise. However, the opposite happens and the weights explode, as in the output below (for lam = 1000)
mlp.0.weight
Parameter containing:
tensor([[-13.9368, 9.9072, 2.2447, -11.6870, 26.7293]],
requires_grad=True)
If anyone could help me, that'd be great. I'm new to pytorch (but not the idea of regularisation), so I'm guessing it's something in my code that is the problem.
Thanks
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.linear_model import LinearRegression
class TrainDataset(Dataset):
def __init__(self, data):
self.data = data
def __len__(self):
return self.data.shape[0]
def __getitem__(self, ind):
x = self.data[ind][1:]
y = self.data[ind][0]
return x, y
class TestDataset(TrainDataset):
def __getitem__(self, ind):
x = self.data[ind]
return x
torch.manual_seed(94)
x_train = np.random.rand(1000, 5)
y_train = x_train[:, 0] * 10 - x_train[:, 1] * 12 + x_train[:, 2] * 2 + x_train[:, 3] * 11 - x_train[:, 4] * 0.25
y_train = y_train.reshape(1000, 1)
x_train.shape
y_train.shape
train_data = np.concatenate((y_train, x_train), axis=1)
train_set = TrainDataset(train_data)
batch_size = 100
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
class MLP(nn.Module):
def __init__(self):
super(MLP, self).__init__()
self.mlp = nn.Sequential(nn.Linear(5, 1, bias=False))
def forward(self, x_mlp):
out = self.mlp(x_mlp)
return out
device = 'cpu'
model = MLP()
optimizer = torch.optim.SGD(model.parameters(), lr=0.02, momentum=0.82)
criterion = nn.MSELoss()
epochs = 5
lam = 0
model.train()
for epoch in range(epochs):
losses = []
for batch_num, input_data in enumerate(train_loader):
optimizer.zero_grad()
x, y = input_data
x = x.to(device).float()
y = y.reshape(batch_size, 1)
y = y.to(device)
output = model(x)
for name, param in model.named_parameters():
if name == 'mlp.0.weight':
l1_norm = torch.norm(param, 1)
loss = criterion(output, y) + lam * l1_norm
loss.backward()
optimizer.step()
print('\tEpoch %d | Batch %d | Loss %6.2f' % (epoch, batch_num, loss.item()))
for name, param in model.named_parameters():
if param.requires_grad:
print(name)
print(param)
I found that if I use Adagrad as the optimiser instead of SGD, it acts as expected. Will need to look into the difference of those now, but this can be considered answered.

FCN dimension in ConvNet+FCN example

So my neural network defined in pytorch is as follows (taken from https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# 1 input image channel, 6 output channels, 3x3 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 3)
self.conv2 = nn.Conv2d(6, 16, 3)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 6 * 6, 120) # 6*6 from image dimension
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square you can only specify a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = x.view(-1, self.num_flat_features(x))
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def num_flat_features(self, x):
size = x.size()[1:] # all dimensions except the batch dimension
num_features = 1
for s in size:
num_features *= s
return num_features
Assuming I want to apply NN on a 1 * 32 * 32 tensor, how does self.fc1 come to be 16 * 6 * 6. I understand the 16, not sure how we get 6.
First off, your convolutions are 3x3 kernels without padding, meaning the data will loss 1 off each side of it's spatial dimensions when passing through. If you're confused as to why, seeing a picture is best:
The blue is your input data and the green the resulting data (one channel each).
Let's step through and see what happens:
Input-> 1x32x32
conv1-> 6x30x30 (lost 2 on each spatial dimension)
max_pool-> 6x15x15 (halves the spatial dimensions)
conv2-> 16x13x13
max_pool-> 16x6x6 (13 doesn't divide evenly)
And there you have it!

Pytorch linear/affine layer parameters confusing

I'm on the Pytorch documentation (https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html) and I'm not really understanding why they are making the the affine layer (16 * 6 * 6, 120). I understand that the last outputs from the convolution layer were 16 and the output here is 120, but even with their annotation, I'm not understanding where the 6 * 6 comes from. Can someone please explain?
import torch
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# 1 input image channel, 6 output channels, 3x3 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 3)
self.conv2 = nn.Conv2d(6, 16, 3)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 6 * 6, 120) # 6*6 from image dimension
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square you can only specify a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = x.view(-1, self.num_flat_features(x))
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def num_flat_features(self, x):
size = x.size()[1:] # all dimensions except the batch dimension
num_features = 1
for s in size:
num_features *= s
return num_features
net = Net()
print(net)
The 6x6 comes from the height and width of x after it has been passed through your convolutions and maxpools.
Here is a simplified version where you can see how the shape changes at each point. It may help to print out the shapes in their example so you can see exactly how everything changes.
import torch
import torch.nn as nn
import torch.nn.functional as F
conv1 = nn.Conv2d(1, 6, 3)
conv2 = nn.Conv2d(6, 16, 3)
# Making a pretend input similar to theirs.
# We define an input with 1 batch, 1 channel, height 32, width 32
x = torch.ones((1,1,32,32))
# Simulating forward()
x = F.max_pool2d(F.relu(conv1(x)), (2, 2))
print(x.shape) # torch.Size([1, 6, 15, 15]) 1 batch, 6 channels, height 15, width 15
x = F.max_pool2d(F.relu(conv2(x)), 2)
print(x.shape) # torch.Size([1, 16, 6, 6]) 1 batch, 16 channels, height 6, width 6
Next they flatten x and pass it through fc1 which accepts 16*6*6 and produces 120 outputs.

Very slow execution of user defined convolution function for neural network in MATLAB

I have an implementation of a convolution neural network in MATLAB (from the open source DeepLearnToolbox). The following code finds the convolution of different weights and parameters:
z = z + convn(net.layers{l - 1}.a{i}, net.layers{l}.k{i}{j}, 'valid');
To update the tool, I have implemented my own fixed-point scheme based convolution using the following code:
function result = convolution(image, kernal)
% find dimensions of output
row = size(image,1) - size(kernal,1) + 1;
col = size(image,2) - size(kernal,2) + 1;
zdim = size(image,3);
%create output matrix
output = zeros(row, col);
% flip the kernal
kernal_flipped = fliplr(flipud(kernal));
%find rows and col of kernal for loop iteration
row_ker = size(kernal_flipped,1);
col_ker = size(kernal_flipped,2);
for k = 1 : zdim
for i = 0 : row-1
for j = 0 : col-1
sum = fi(0,1,8,7);
prod = fi(0,1,8,7);
for k_row = 1 : row_ker
for k_col = 1 : col_ker
a = image(k_row+i, k_col+j, k);
b = kernal_flipped(k_row,k_col);
prod = a * b;
% convert to fixed point
prod = fi((product/16384), 1, 8, 7);
sum = fi((sum + prod), 1, 8, 7);
end
end
output(i+1, j+1, k) = sum;
end
end
end
result = output;
end
The problem is that when I use my convolution implementation in the bigger application, it is super slow.
Any suggestions how to improve its execution time?
MATLAB doesn't support fixed point 2D convolution, but knowing that convolution can be written as matrix multiplication and that MATLAB has support for fixed point matrix multiplication you can use im2col to convert the image into column format and multiply it by the kernel to convolve them.
row = size(image,1) - size(kernal,1) + 1;
col = size(image,2) - size(kernal,2) + 1;
zdim = size(image,3);
output = zeros(row, col);
kernal_flipped = fliplr(flipud(kernal));
fi_kernel = fi(kernal_flipped(:).', 1, 8, 7) / 16384;
sz = size(kernal_flipped);
sz_img = size(image);
% Use the generated indexes to convert the image into column format
idx_col = im2col(reshape(1:numel(image)/zdim,sz_img(1:2)),sz,'sliding');
image = reshape(image,[],zdim);
for k = 1:zdim
output(:,:,k) = double(fi_kernel * reshape(image(idx_col,k),size(idx_col)));
end

How is cconv in matlab implemented?

My understanding was, that these three should be the same, however matlab gives completely different results. The first and third one are in sync with what I calculated by hand, the second one is different.
x_1 = [1, 2, 0, 5];
x_2 = [1/2, -1/4, 1, 0, 3/4];
y_2_1 = ifft(fft(x_1, 2) .* fft(x_2, 2))
y_2_2 = cconv(x_2, x_1, 2)
y_2_3 = cconv(x_2(1:2), x_1(1:2), 2)
From the documentation:
The modulo-2 circular convolution is equivalent to splitting the linear convolution into two-element arrays and summing the arrays.
So it is not the same to do
res = cconv(x_2, x_1, 2);
as to do
res2 = cconv(x_2, x_1);
res2 = res(1:2);
The former is equivalent to
res = cconv(x_2, x_1);
res = res(1:2) + res(3:4) + res(5:6) + ...;
(padding with zeros if res is odd in size).
On the other hand,
res3 = ifft(fft(x_1, 2) .* fft(x_2, 2));
is equivalent to
res3 = fft(x_1(1:2)) .* fft(x_2(1:2));
res3 = ifft(res3);
and different from either of the two cconv results.