How to fix incorrect channel size in pytorch neural network? - neural-network

I'm working with the Google utterance dataset in spectrogram form. Each data point has dimension (160, 101). In my data loader, I used batch_size=128. Therefore, each batch has dimension (128, 160, 101).
I use a LeNet model with the following code as the model:
class LeNet(nn.Module):
def __init__(self):
super(LeNet, self).__init__()
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16*5*5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 30)
def forward(self, x):
out = F.relu(self.conv1(x))
out = F.max_pool2d(out, 2)
out = F.relu(self.conv2(out))
out = F.max_pool2d(out, 2)
out = out.view(out.size(0), -1)
out = F.relu(self.fc1(out))
out = F.relu(self.fc2(out))
out = self.fc3(out)
return out
I tried unsqueezing the data with dim=3, but got this error:
Traceback (most recent call last):
File "train_speech.py", line 359, in <module>
train_loss, reg_loss, train_acc, cost = train(epoch)
File "train_speech.py", line 258, in train
outputs = (net(inputs))['out']
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/parallel/data_parallel.py", line 166, in forward
return self.module(*inputs[0], **kwargs[0])
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/content/gdrive/My Drive/Colab Notebooks/mixup_erm-master/models/lenet.py", line 15, in forward
out = F.relu(self.conv1(x))
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/conv.py", line 443, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/conv.py", line 440, in _conv_forward
self.padding, self.dilation, self.groups)
RuntimeError: Given groups=1, weight of size [6, 1, 5, 5], expected input[128, 160, 101, 1] to have 1 channels, but got 160 channels instead
How do I fix this issue?
EDIT: New Error Message Below
torch.Size([128, 160, 101])
torch.Size([128, 1, 160, 101])
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at /pytorch/c10/core/TensorImpl.h:1156.)
return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
Traceback (most recent call last):
File "train_speech.py", line 363, in <module>
train_loss, reg_loss, train_acc, cost = train(epoch)
File "train_speech.py", line 262, in train
outputs = (net(inputs))['out']
IndexError: too many indices for tensor of dimension 2
I'm unsqueezing the data in each batch. The relevant section of my training code is below. inputs is analogous to x.
print(inputs.shape)
inputs = inputs.unsqueeze(1)
print(inputs.shape)
outputs = (net(inputs))['out']
Edit 2: New Error
Traceback (most recent call last):
File "train_speech.py", line 361, in <module>
train_loss, reg_loss, train_acc, cost = train(epoch)
File "train_speech.py", line 270, in train
loss.backward()
File "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py", line 149, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: Function AddmmBackward returned an invalid gradient at index 1 - got [128, 400] but expected shape compatible with [128, 13024]
Edit 3: Train Loop Below
def train(epoch):
print('\nEpoch: %d' % epoch)
net.train()
train_loss = 0
reg_loss = 0
correct = 0
total = 0
for batch_idx, (inputs, targets) in enumerate(trainloader):
if use_cuda:
inputs, targets = inputs.cuda(), targets.cuda()
inputs, targets_a, targets_b, lam,layer, cost = mixup_data(inputs, targets,
args.alpha,args.mixupBatch, use_cuda)
inputs, targets_a, targets_b = map(Variable, (inputs,
targets_a, targets_b))
outputs = net(inputs)
loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
train_loss += loss.item()
_, predicted = torch.max(outputs.data, 1)
total += targets.size(0)
correct += (lam * predicted.eq(targets_a.data).cpu().sum().float()
+ (1 - lam) * predicted.eq(targets_b.data).cpu().sum().float())
optimizer.zero_grad()
loss.backward()
optimizer.step()
return (train_loss/batch_idx, reg_loss/batch_idx, 100.*correct/total, cost/batch_idx)

You should expand on axis=1 a.k.a. the channel axis:
>>> x = x.unsqueeze(1)
If you're inside the dataset __getitem__, then it corresponds to axis=0.

Related

"ValueError: max_evals=500 is too low for the Permutation explainer" shap answers me do I have to give more data (photos)?

I want to test the explainability of a multiclass semantic segmentation model, deeplab_v3plus with shap to know which features contribute the most to semantic classification. However I have a ValueError: max_evals=500 is too low when running my file, and I struggle to understand the reason.
import glob
from PIL import Image
import torch
from torchvision import transforms
from torchvision.utils import make_grid
import torchvision.transforms.functional as tf
from deeplab import deeplab_v3plus
import shap
def test(args):
# make a video prez
model = deeplab_v3plus('resnet101', num_classes=args.nclass, output_stride=16, pretrained_backbone=True)
model.load_state_dict(torch.load(args.seg_file,map_location=torch.device('cpu'))) # because no gpu available on sandbox environnement
model = model.to(args.device)
model.eval()
explainer = shap.Explainer(model)
with torch.no_grad():
for i, file in enumerate(args.img_folder):
img = img2tensor(file, args)
pred = model(img)
print(explainer(img))
if __name__ == '__main__':
class Arguments:
def __init__(self):
self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
self.seg_file = "Model_Woodscape.pth"
self.img_folder = glob.glob("test_img/*.png")
self.mean = [0.485, 0.456, 0.406]
self.std = [0.229, 0.224, 0.225]
self.h, self.w = 483, 640
self.nclass = 10
self.cmap = {
1: [128, 64, 128], # "road",
2: [69, 76, 11], # "lanemarks",
3: [0, 255, 0], # "curb",
4: [220, 20, 60], # "person",
5: [255, 0, 0], # "rider",
6: [0, 0, 142], # "vehicles",
7: [119, 11, 32], # "bicycle",
8: [0, 0, 230], # "motorcycle",
9: [220, 220, 0], # "traffic_sign",
0: [0, 0, 0] # "void"
}
args = Arguments()
test(args)
But it returns:
(dee_env) jovyan#jupyter:~/use-cases/Scene_understanding/Code_Woodscape/deeplab_v3+$ python test_shap.py
BILINEAR is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BILINEAR instead.
Traceback (most recent call last):
File "/home/jovyan/use-cases/Scene_understanding/Code_Woodscape/deeplab_v3+/test_shap.py", line 85, in <module>
test(args)
File "/home/jovyan/use-cases/Scene_understanding/Code_Woodscape/deeplab_v3+/test_shap.py", line 37, in test
print(explainer(img))
File "/home/jovyan/use-cases/Scene_understanding/Code_Woodscape/deeplab_v3+/dee_env/lib/python3.9/site-packages/shap/explainers/_permutation.py", line 82, in __call__
return super().__call__(
File "/home/jovyan/use-cases/Scene_understanding/Code_Woodscape/deeplab_v3+/dee_env/lib/python3.9/site-packages/shap/explainers/_explainer.py", line 266, in __call__
row_result = self.explain_row(
File "/home/jovyan/use-cases/Scene_understanding/Code_Woodscape/deeplab_v3+/dee_env/lib/python3.9/site-packages/shap/explainers/_permutation.py", line 164, in explain_row
raise ValueError(f"max_evals={max_evals} is too low for the Permutation explainer, it must be at least 2 * num_features + 1 = {2 * len(inds) + 1}!")
ValueError: max_evals=500 is too low for the Permutation explainer, it must be at least 2 * num_features + 1 = 1854721!
In the source code it looks like it's because I don't give enough arguments. I only have three images in my test_img/* folder, is that why?
I have the same issue. A possible solution I found which seems to be working for my case is to replace this line
explainer = shap.Explainer(model)
With this line
explainer = shap.explainers.Permutation(model, max_evals = 1854721)
shap.Explainer by default has algorithm='auto'. From the documentation: shape.Explainer
By default the “auto” options attempts to make the best choice given
the passed model and masker, but this choice can always be overriden
by passing the name of a specific algorithm.
Since 'permutation' has been selected you can directly use shap.explainers.Permutation and set max_evals to the value suggested in the error message above.
Given the high number of your use case, this might take a really long time. I would suggest to use an easier model just for testing the above solution.

When I add workers to neutral network I get an error, pytorch

I have checked a lot of post and none of them seem to work for me. But when I try to add workers to the dataloader in pytorch it just feeds me an error back. I have tired reading it and figuring it out but I can't seem to find a solution. I assume there is something I'm supposed to add to make the workers able to do their job.
I have 64GB of ram, i9-9900k, and a 3080ti. So I don't think its a memory error is it?
I included the error code with 1 worker and 4 workers because they seem to be different.
Also it works with zero workers.
here is the error with 4 workers:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 114, in _main
prepare(preparation_data)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 225, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 277, in _fixup_main_from_path
run_name="__mp_main__")
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
Traceback (most recent call last):
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 990, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\queue.py", line 172, in get raise Empty
queue.Empty
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "c:/Users/14055/Desktop/Class 1 Project/Chegg.py", line 202, in <module>
training()
File "c:/Users/14055/Desktop/Class 1 Project/Chegg.py", line 122, in training
for data, target in load_data.train_loader:
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 521, in __next__ data = self._next_data()
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 1186, in _next_data idx, data = self._get_data()
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 1142, in _get_data success, data = self._try_get_data()
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 1003, in _try_get_data
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
RuntimeError: DataLoader worker (pid(s) 23204, 7668, 13636, 6132) exited unexpectedly
Error with 1 worker:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 114, in _main
prepare(preparation_data)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 225, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 277, in _fixup_main_from_path
run_name="__mp_main__")
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "c:\Users\14055\Desktop\Class 1 Project\Chegg.py", line 202, in <module>
training()
File "c:\Users\14055\Desktop\Class 1 Project\Chegg.py", line 122, in training
for data, target in load_data.train_loader:
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 359, in __iter__
return self._get_iterator()
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 305, in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 918, in __init__
w.start()
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py", line 223, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\popen_spawn_win32.py", line 33, in __init__
prep_data = spawn.get_preparation_data(process_obj._name)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 143, in get_preparation_data
_check_not_importing_main()
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 136, in _check_not_importing_main
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
Traceback (most recent call last):
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 990, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\queue.py", line 172, in get
raise Empty
queue.Empty
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "c:/Users/14055/Desktop/Class 1 Project/Chegg.py", line 202, in <module>
training()
File "c:/Users/14055/Desktop/Class 1 Project/Chegg.py", line 122, in training
for data, target in load_data.train_loader:
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 521, in __next__
data = self._next_data()
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 1186, in _next_data
idx, data = self._get_data()
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 1142, in _get_data
success, data = self._try_get_data()
File "C:\Users\14055\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py", line 1003, in _try_get_data
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
RuntimeError: DataLoader worker (pid(s) 3372) exited unexpectedly
Code:
from numpy import testing
import torch.cuda
import numpy as np
import time
import array as arr
import os
from datetime import date, datetime
from torchvision import datasets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
torch.cuda.set_device(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def load_data():
num_workers = 1
load_data.batch_size = 20
transform = transforms.ToTensor()
train_data = datasets.MNIST(root='data', train=True, download=True, transform=transform)
load_data.train_loader = torch.utils.data.DataLoader(train_data,
batch_size=load_data.batch_size, num_workers=num_workers, pin_memory=True,
shuffle=True)
test_data = datasets.MNIST(root='data', train=False, download=True, transform=transform)
load_data.test_loader = torch.utils.data.DataLoader(test_data,
batch_size=load_data.batch_size, num_workers=num_workers, pin_memory=True,
shuffle=True)
def visualize():
dataiter = iter(load_data.train_loader)
visualize.images, labels = dataiter.next()
visualize.images = visualize.images.numpy()
fig = plt.figure(figsize=(25, 4))
for idx in np.arange(load_data.batch_size):
ax = fig.add_subplot(2, load_data.batch_size/2, idx+1, xticks=[], yticks=[])
ax.imshow(np.squeeze(visualize.images[idx]), cmap='gray')
ax.set_title(str(labels[idx].item()))
#plt.show()
def fig_values():
img = np.squeeze(visualize.images[1])
fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(111)
ax.imshow(img, cmap='gray')
width, height = img.shape
thresh = img.max()/2.5
for x in range(width):
for y in range(height):
val = round(img[x][y],2) if img[x][y] !=0 else 0
ax.annotate(str(val), xy=(y,x),
horizontalalignment='center',
verticalalignment='center',
color='white' if img[x][y]<thresh else 'black')
#plt.show()
load_data()
#visualize()
#fig_values()
class NeuralNet(nn.Module):
def __init__(self, gpu = True):
super(NeuralNet, self ).__init__()
self.conv1 = nn.Conv2d(in_channels=1, out_channels=128, kernel_size=3, padding=1)
self.bn1 = nn.BatchNorm2d(num_features=128)
self.tns1 = nn.Conv2d(in_channels=128, out_channels=4, kernel_size=1, padding=1)
self.conv2 = nn.Conv2d(in_channels=4, out_channels=16, kernel_size=3, padding=1)
self.bn2 = nn.BatchNorm2d(num_features=16)
self.pool1 = nn.MaxPool2d(2,2)
self.conv3 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
self.bn3 = nn.BatchNorm2d(num_features=16)
self.conv4 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
self.bn4 = nn.BatchNorm2d(num_features=32)
self.pool2 = nn.MaxPool2d(2,2)
self.tns2 = nn.Conv2d(in_channels=32, out_channels=16, kernel_size=1, padding=1)
self.conv5 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
self.bn5 = nn.BatchNorm2d(num_features=16)
self.conv6 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
self.bn6 = nn.BatchNorm2d(num_features=32)
self.conv7 = nn.Conv2d(in_channels=32, out_channels=10, kernel_size=1, padding=1)
self.gpool = nn.AvgPool2d(kernel_size=7)
self.drop = nn.Dropout2d(0.1)
def forward(self, x):
x = self.tns1(self.drop(self.bn1(F.relu(self.conv1(x)))))
x = self.drop(self.bn2(F.relu(self.conv2(x))))
x = self.pool1(x)
x = self.drop(self.bn3(F.relu(self.conv3(x))))
x = self.drop(self.bn4(F.relu(self.conv4(x))))
x = self.tns2(self.pool2(x))
x = self.drop(self.bn5(F.relu(self.conv5(x))))
x = self.drop(self.bn6(F.relu(self.conv6(x))))
x = self.conv7(x)
x = self.gpool(x)
x = x.view(-1, 10)
return F.log_softmax(x).to(device)
#has antioverfit
def training():
model.to(device)
optimizer= torch.optim.SGD(model.parameters(), lr=0.003, weight_decay= 0.00005, momentum = .9, nesterov = True)
n_epochs = 20000
a = np.float64([9,9,9,9,9]) #antioverfit
testing_loss = 0.0
for epoch in range(n_epochs) :
if(testing_loss <= a[4]): # part of anti overfit
train_loss = 0.0
testing_loss = 0.0
model.train().to(device)
for data, target in load_data.train_loader:
optimizer.zero_grad()
data = data.to(device) #gpu
target = target.to(device) #gpu
output = model(data).to(device)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
train_loss += loss.item()*data.size(0)
train_loss = train_loss/len(load_data.train_loader.dataset)
print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
model.eval().to(device) # Gets Validation loss
train_loss = 0.0
with torch.no_grad():
for data, target in load_data.test_loader:
data = data.to(device)
target = target.to(device)
output = model(data).to(device)
loss =F.nll_loss(output, target)
testing_loss += loss.item()*data.size(0)
testing_loss = testing_loss / len(load_data.test_loader.dataset)
print('Validation loss = ' , testing_loss)
a = np.insert(a,0,testing_loss) # part of anti overfit
a = np.delete(a,5)
print('Validation loss = ' , testing_loss)
def evalution():
test_loss = 0.0
class_correct = list(0. for i in range(10))
class_total = list(0. for i in range(10))
model.eval().to(device)
for data, target in load_data.test_loader:
data = data.to(device)
target = target.to(device)
output = model(data).to(device)
loss =F.nll_loss(output, target)
test_loss += loss.item()*data.size(0)
_, pred = torch.max(output, 1)
correct = np.squeeze(pred.eq(target.data.view_as(pred))).to(device)
for i in range(load_data.batch_size):
try:
label = target.data[i]
class_correct[label] += correct[i].item()
class_total[label] += 1
except IndexError:
break
# calculate and print avg test loss
test_loss = test_loss/len(load_data.test_loader.dataset)
print('Test Loss: {:.6f}\n'.format(test_loss))
for i in range(10):
if class_total[i] > 0:
print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
str(i), 100 * class_correct[i] / class_total[i],
np.sum(class_correct[i]), np.sum(class_total[i])))
else:
print('Test Accuracy of %5s: N/A (no training examples)' )
print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
100. * np.sum(class_correct) / np.sum(class_total),
np.sum(class_correct), np.sum(class_total)))
acc = (
100. * np.sum(class_correct) / np.sum(class_total),
np.sum(class_correct), np.sum(class_total))
name = f"model-{acc}.pt"
name2 = f"model-{acc}.pth"
save_path = os.path.join("models", name)
save_path2 = os.path.join("models", name2)
torch.save(model, save_path)
torch.save(model, save_path2)
model = NeuralNet().to(device)
summary(model, input_size=(1, 28, 28))
training()
evalution()

Tensorflow gradient shape incompatible when using Convolutional Transpose Layer

I was having an issue when trying to create a convolution-deconvolution network. The original image dimensions are 565 * 584 and I'm trying to produce a segmentation of 565 * 584.
While I didn't have an issue before with my network with 1024*1024 images, I have been having some issues with these dimensions. I am getting this issue when computing the gradient:
segmentation_result.shape: (?, 565, 584, 1), targets.shape: (?, 565, 584, 1)
Process Process-1:
Traceback (most recent call last):
\Python\Python35\lib\site-packages\tensorflow\python\framework\tensor_shape.py", line 558, in merge_with
self.assert_is_compatible_with(other)
\Python\Python35\lib\site-packages\tensorflow\python\framework\tensor_shape.py", line 106, in assert_is_compatible_with
other))
ValueError: Dimensions 565 and 566 are not compatible
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
\Python\Python35\lib\multiprocessing\process.py", line 249, in _bootstrap
self.run()
\Python\Python35\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
.py", line 418, in train
network = Network(net_id = count, weight=pos_weight)
.py", line 199, in __init__
self.train_op = tf.train.AdamOptimizer().minimize(self.cost)
\Python\Python35\lib\site-packages\tensorflow\python\training\optimizer.py", line 315, in minimize
grad_loss=grad_loss)
\Python\Python35\lib\site-packages\tensorflow\python\training\optimizer.py", line 386, in compute_gradients
colocate_gradients_with_ops=colocate_gradients_with_ops)
\Python\Python35\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 560, in gradients
in_grad.set_shape(t_in.get_shape())
\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 443, in set_shape
self._shape = self._shape.merge_with(shape)
\Python\Python35\lib\site-packages\tensorflow\python\framework\tensor_shape.py", line 561, in merge_with
raise ValueError("Shapes %s and %s are not compatible" % (self, other))
ValueError: Shapes (?, 565, 584, 64) and (?, 566, 584, 64) are not compatible
The entire network has 10 convolutional layers and 10 deconvolutional layers. Each deconvolutional layer is a reversed version of the forward layer. Here is an example of the code to produce the convolutional layer:
def create_layer_reversed(self, input, prev_layer=None):
net_id = self.net_id
print(net_id)
with tf.variable_scope('conv', reuse=False):
W = tf.get_variable('W{}_{}_'.format(self.name[-3:], net_id),
shape=(self.kernel_size, self.kernel_size, self.input_shape[3], self.output_channels))
b = tf.Variable(tf.zeros([W.get_shape().as_list()[2]]))
output = tf.nn.conv2d_transpose(
input, W,
tf.stack([tf.shape(input)[0], self.input_shape[1], self.input_shape[2], self.input_shape[3]]),
strides=[1,1,1,1], padding='SAME')
Conv2d.layer_index += 1
output.set_shape([None, self.input_shape[1], self.input_shape[2], self.input_shape[3]])
output = lrelu(tf.add(tf.contrib.layers.batch_norm(output), b))
return output

Loading a pretrained model fails when multiple GPU was used for training

I have trained a network model and saved its weights and architecture via checkpoint = ModelCheckpoint(filepath='weights.hdf5') callback. During training, I am using multiple GPUs by calling the funtion below:
def make_parallel(model, gpu_count):
def get_slice(data, idx, parts):
shape = tf.shape(data)
size = tf.concat([ shape[:1] // parts, shape[1:] ],axis=0)
stride = tf.concat([ shape[:1] // parts, shape[1:]*0 ],axis=0)
start = stride * idx
return tf.slice(data, start, size)
outputs_all = []
for i in range(len(model.outputs)):
outputs_all.append([])
#Place a copy of the model on each GPU, each getting a slice of the batch
for i in range(gpu_count):
with tf.device('/gpu:%d' % i):
with tf.name_scope('tower_%d' % i) as scope:
inputs = []
#Slice each input into a piece for processing on this GPU
for x in model.inputs:
input_shape = tuple(x.get_shape().as_list())[1:]
slice_n = Lambda(get_slice, output_shape=input_shape, arguments={'idx':i,'parts':gpu_count})(x)
inputs.append(slice_n)
outputs = model(inputs)
if not isinstance(outputs, list):
outputs = [outputs]
#Save all the outputs for merging back together later
for l in range(len(outputs)):
outputs_all[l].append(outputs[l])
# merge outputs on CPU
with tf.device('/cpu:0'):
merged = []
for outputs in outputs_all:
merged.append(merge(outputs, mode='concat', concat_axis=0))
return Model(input=model.inputs, output=merged)
With the testing code:
from keras.models import Model, load_model
import numpy as np
import tensorflow as tf
model = load_model('cpm_log/deneme.hdf5')
x_test = np.random.randint(0, 255, (1, 368, 368, 3))
output = model.predict(x = x_test, batch_size=1)
print output[4].shape
I got the error below:
Traceback (most recent call last):
File "cpm_test.py", line 5, in <module>
model = load_model('cpm_log/Jun5_1000/deneme.hdf5')
File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 240, in load_model
model = model_from_config(model_config, custom_objects=custom_objects)
File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 301, in model_from_config
return layer_module.deserialize(config, custom_objects=custom_objects)
File "/usr/local/lib/python2.7/dist-packages/keras/layers/__init__.py", line 46, in deserialize
printable_module_name='layer')
File "/usr/local/lib/python2.7/dist-packages/keras/utils/generic_utils.py", line 140, in deserialize_keras_object
list(custom_objects.items())))
File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2378, in from_config
process_layer(layer_data)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2373, in process_layer
layer(input_tensors[0], **kwargs)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 578, in __call__
output = self.call(inputs, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/keras/layers/core.py", line 659, in call
return self.function(inputs, **arguments)
File "/home/muhammed/DEV_LIBS/developments/mocap/pose_estimation/training/cpm/multi_gpu.py", line 12, in get_slice
def get_slice(data, idx, parts):
NameError: global name 'tf' is not defined
By inspecting the error output, I decide that the problem is with the parallelization code. However, I can't resolve the issue.
You may need to use custom_objects to enable loading of the model.
import tensorflow as tf
model = load_model('model.h5', custom_objects={'tf': tf,})

Incompatible shapes on tensorflow.equal() op for correct predictions evaluation

Using the MNIST tutorial of Tensorflow, I try to make a convolutional network for face recognition with the "Database of Faces".
The images size are 112x92, I use 3 more convolutional layer to reduce it to 6 x 5 as adviced here
I'm very new at convolutional network and most of my layer declaration is made by analogy to the Tensorflow MNIST tutorial, it may be a bit clumsy, so feel free to advice me on this.
x_image = tf.reshape(x, [-1, 112, 92, 1])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
W_conv3 = weight_variable([5, 5, 64, 128])
b_conv3 = bias_variable([128])
h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)
h_pool3 = max_pool_2x2(h_conv3)
W_conv4 = weight_variable([5, 5, 128, 256])
b_conv4 = bias_variable([256])
h_conv4 = tf.nn.relu(conv2d(h_pool3, W_conv4) + b_conv4)
h_pool4 = max_pool_2x2(h_conv4)
W_conv5 = weight_variable([5, 5, 256, 512])
b_conv5 = bias_variable([512])
h_conv5 = tf.nn.relu(conv2d(h_pool4, W_conv5) + b_conv5)
h_pool5 = max_pool_2x2(h_conv5)
W_fc1 = weight_variable([6 * 5 * 512, 1024])
b_fc1 = bias_variable([1024])
h_pool5_flat = tf.reshape(h_pool5, [-1, 6 * 5 * 512])
h_fc1 = tf.nn.relu(tf.matmul(h_pool5_flat, W_fc1) + b_fc1)
keep_prob = tf.placeholder("float")
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
print orlfaces.train.num_classes # 40
W_fc2 = weight_variable([1024, orlfaces.train.num_classes])
b_fc2 = bias_variable([orlfaces.train.num_classes])
y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
My problem appear when the session run the "correct_prediction" op which is
tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
At least I think given the error message:
W tensorflow/core/common_runtime/executor.cc:1027] 0x19369d0 Compute status: Invalid argument: Incompatible shapes: [8] vs. [20]
[[Node: Equal = Equal[T=DT_INT64, _device="/job:localhost/replica:0/task:0/cpu:0"](ArgMax, ArgMax_1)]]
Traceback (most recent call last):
File "./convolutional.py", line 133, in <module>
train_accuracy = accuracy.eval(feed_dict = {x: batch[0], y_: batch[1], keep_prob: 1.0})
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 405, in eval
return _eval_using_default_session(self, feed_dict, self.graph, session)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2728, in _eval_using_default_session
return session.run(tensors, feed_dict)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 345, in run
results = self._do_run(target_list, unique_fetch_targets, feed_dict_string)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 419, in _do_run
e.code)
tensorflow.python.framework.errors.InvalidArgumentError: Incompatible shapes: [8] vs. [20]
[[Node: Equal = Equal[T=DT_INT64, _device="/job:localhost/replica:0/task:0/cpu:0"](ArgMax, ArgMax_1)]]
Caused by op u'Equal', defined at:
File "./convolutional.py", line 125, in <module>
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 328, in equal
return _op_def_lib.apply_op("Equal", x=x, y=y, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/op_def_library.py", line 633, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1710, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 988, in __init__
self._traceback = _extract_stack()
It looks like the y_conv output a matrix of shape 8 x batch_size instead of number_of_class x batch_size
If I change the batch size from 20 to 10, the error message stay the same but instead [8] vs. [20] I get [4] vs. [10]. So from that I conclude that the problem may come from the y_conv declaration (last line of the code above).
The loss function, optimizer, training, etc declarations is the same as in the MNIST tutorial:
cross_entropy = -tf.reduce_sum(y_ * tf.log(y_conv))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
sess.run((tf.initialize_all_variables()))
for i in xrange(1000):
batch = orlfaces.train.next_batch(20)
if i % 100 == 0:
train_accuracy = accuracy.eval(feed_dict = {x: batch[0], y_: batch[1], keep_prob: 1.0})
print "Step %d, training accuracy %g" % (i, train_accuracy)
train_step.run(feed_dict = {x: batch[0], y_: batch[1], keep_prob: 0.5})
print "Test accuracy %g" % accuracy.eval(feed_dict = {x: orlfaces.test.images, y_: orlfaces.test.labels, keep_prob: 1.0})
Thanks for reading, have a good day
Well, after a lot debugging, I found that my issue was due to a bad instantiation of the labels. Instead of creating arrays full of zeros and replace one value by one, I created them with random value! Stupid mistake. In case someone wondering what I did wrong there and how I fix it here is the change I made.
Anyway during all the debugging I made, to find this mistake, I found some useful information to debug this kind of problem:
For the cross entropy declaration, the tensorflow's MNIST tutorial use a formula that can lead to NaN value
This formula is
cross_entropy = -tf.reduce_sum(y_ * tf.log(y_conv))
Instead of this, I found two ways to declare it in a safer fashion:
cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y_conv, 1e-10, 1.0)))
or also:
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logit, y_))
As mrry says. printing the shape of the tensors can help to detect shape anomaly.
To get the shape of a tensor just call his get_shape() method like this:
print "W shape:", W.get_shape()
user1111929 in this question use a debug print that help me assert where the problem come from.