I created this simplified version of VGG16:
class VGG16COMBO(nn.Module):
def __init__(self, num_classes):
super(VGG16COMBO, self).__init__()
# calculate same padding:
# (w - k + 2*p)/s + 1 = o
# => p = (s(o-1) - w + k)/2
self.block_1 = nn.Sequential(
nn.Conv2d(in_channels=1,
out_channels=64,
kernel_size=(3, 3),
stride=(1, 1),
# (1(32-1)- 32 + 3)/2 = 1
padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(in_channels=64,
out_channels=64,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.block_2 = nn.Sequential(
nn.Conv2d(in_channels=64,
out_channels=128,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.Conv2d(in_channels=128,
out_channels=128,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.block_3 = nn.Sequential(
nn.Conv2d(in_channels=128,
out_channels=256,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.Conv2d(in_channels=256,
out_channels=256,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.Conv2d(in_channels=256,
out_channels=256,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.block_4 = nn.Sequential(
nn.Conv2d(in_channels=256,
out_channels=512,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(512),
nn.ReLU(),
nn.Conv2d(in_channels=512,
out_channels=512,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(512),
nn.ReLU(),
nn.Conv2d(in_channels=512,
out_channels=512,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(512),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.classifier = nn.Sequential(
nn.Linear(2048, 4096),
nn.ReLU(True),
nn.Dropout(p=0.25),
nn.Linear(4096, 4096),
nn.ReLU(True),
nn.Dropout(p=0.25),
nn.Linear(4096, num_classes),
)
def forward(self, m, c):
m = self.block_1(m)
m = self.block_2(m)
m = self.block_3(m)
m = self.block_4(m)
m = m.view(m.size(0), -1)
m = self.classifier(m)
c = self.block_1(c)
c = self.block_2(c)
c = self.block_3(c)
c = self.block_4(c)
c = c.view(c.size(0), -1)
c = self.classifier(c)
x = torch.cat((m, c), dim=1)
return x
You can see that in forward I pass 2 elements, m and c. m is referred to MNIST, and c to CIFAR10, because i want a multi input neural network (or a network with shared weights).
Then:
modelcombo = VGG16COMBO(1).cuda()
print(modelcombo)
# Define an optimizier
import torch.optim as optim
optimizer = optim.SGD(modelcombo.parameters(), lr = 0.01)
# Define a loss
criterion = nn.BCEWithLogitsLoss()
This is my training function:
#train da modificare con entrambi i dataset
def train(net, loaders, optimizer, criterion, epochs=20, dev=dev, save_param = False, model_name="valerio"):
try:
net = net.to(dev)
#print(net)
# Initialize history
history_loss = {"train": [], "val": [], "test": []}
history_accuracy = {"train": [], "val": [], "test": []}
# Store the best val accuracy
best_val_accuracy = 0
# Process each epoch
for epoch in range(epochs):
# Initialize epoch variables
sum_loss = {"train": 0, "val": 0, "test": 0}
sum_accuracy = {"train": 0, "val": 0, "test": 0}
# Process each split
for split in ["train", "val", "test"]:
if split == "train":
net.train()
else:
net.eval()
# Process each batch
for (input, labels) in loaders[split]:
# Move to CUDA
input = input.to(dev)
labels = labels.to(dev)
# Reset gradients
optimizer.zero_grad()
# Compute output
pred = net(input)
#pred = pred.squeeze(dim=1) # Output shape is [Batch size, 1], but we want [Batch size]
labels = labels.unsqueeze(1)
labels = labels.float()
loss = criterion(pred, labels)
# Update loss
sum_loss[split] += loss.item()
# Check parameter update
if split == "train":
# Compute gradients
loss.backward()
# Optimize
optimizer.step()
# Compute accuracy
#pred_labels = pred.argmax(1) + 1
pred_labels = (pred >= 0.5).long() # Binarize predictions to 0 and 1
batch_accuracy = (pred_labels == labels).sum().item()/input.size(0)
# Update accuracy
sum_accuracy[split] += batch_accuracy
# Compute epoch loss/accuracy
epoch_loss = {split: sum_loss[split]/len(loaders[split]) for split in ["train", "val", "test"]}
epoch_accuracy = {split: sum_accuracy[split]/len(loaders[split]) for split in ["train", "val", "test"]}
# Store params at the best validation accuracy
if save_param and epoch_accuracy["val"] > best_val_accuracy:
#torch.save(net.state_dict(), f"{net.__class__.__name__}_best_val.pth")
torch.save(net.state_dict(), f"{model_name}_best_val.pth")
best_val_accuracy = epoch_accuracy["val"]
# Update history
for split in ["train", "val", "test"]:
history_loss[split].append(epoch_loss[split])
history_accuracy[split].append(epoch_accuracy[split])
# Print info
print(f"Epoch {epoch+1}:",
f"TrL={epoch_loss['train']:.4f},",
f"TrA={epoch_accuracy['train']:.4f},",
f"VL={epoch_loss['val']:.4f},",
f"VA={epoch_accuracy['val']:.4f},",
f"TeL={epoch_loss['test']:.4f},",
f"TeA={epoch_accuracy['test']:.4f},")
except KeyboardInterrupt:
print("Interrupted")
finally:
# Plot loss
plt.title("Loss")
for split in ["train", "val", "test"]:
plt.plot(history_loss[split], label=split)
plt.legend()
plt.show()
# Plot accuracy
plt.title("Accuracy")
for split in ["train", "val", "test"]:
plt.plot(history_accuracy[split], label=split)
plt.legend()
plt.show()
But when I do the training
# Train model
train(modelcombo, loaders, optimizer, criterion, epochs=10, dev=dev)
I obtain this error:
TypeError: forward() missing 1 required positional argument: 'c'
What I have to change, the net or the training function? I think the problem is in the training function, because I have to pass loaders, and loaders_cifar, but I don't know how. In particular, I have to cat loaders of mnist and loaders of cifar before passing them to the training function, or I have to modify the for (input, labels) in loaders[split]: in something like for (input, labels) in loaders[split] and loaders_cifar[split]:?
EDIT: I created this function:
def itr_merge(*itrs):
for itr in itrs:
for v in itr:
yield v
Edited the training function in this way:
#train da modificare con entrambi i dataset
def train2(net, loaders, loaders_cifar, optimizer, criterion, epochs=20, dev=dev, save_param = False, model_name="valerio"):
try:
net = net.to(dev)
#print(net)
# Initialize history
history_loss = {"train": [], "val": [], "test": []}
history_accuracy = {"train": [], "val": [], "test": []}
# Store the best val accuracy
best_val_accuracy = 0
# Process each epoch
for epoch in range(epochs):
# Initialize epoch variables
sum_loss = {"train": 0, "val": 0, "test": 0}
sum_accuracy = {"train": 0, "val": 0, "test": 0}
# Process each split
for split in ["train", "val", "test"]:
if split == "train":
net.train()
else:
net.eval()
# Process each batch
for x in itr_merge(loaders[split], loaders_cifar[split]):
for (input, labels) in loaders[split]:
# Move to CUDA
input = input.to(dev)
labels = labels.to(dev)
# Reset gradients
optimizer.zero_grad()
# Compute output
pred = net(input)
#pred = pred.squeeze(dim=1) # Output shape is [Batch size, 1], but we want [Batch size]
labels = labels.unsqueeze(1)
labels = labels.float()
loss = criterion(pred, labels)
# Update loss
sum_loss[split] += loss.item()
# Check parameter update
if split == "train":
# Compute gradients
loss.backward()
# Optimize
optimizer.step()
# Compute accuracy
#pred_labels = pred.argmax(1) + 1
pred_labels = (pred >= 0.5).long() # Binarize predictions to 0 and 1
batch_accuracy = (pred_labels == labels).sum().item()/input.size(0)
# Update accuracy
sum_accuracy[split] += batch_accuracy
# Compute epoch loss/accuracy
epoch_loss = {split: sum_loss[split]/len(loaders[split]) for split in ["train", "val", "test"]}
epoch_accuracy = {split: sum_accuracy[split]/len(loaders[split]) for split in ["train", "val", "test"]}
# Store params at the best validation accuracy
if save_param and epoch_accuracy["val"] > best_val_accuracy:
#torch.save(net.state_dict(), f"{net.__class__.__name__}_best_val.pth")
torch.save(net.state_dict(), f"{model_name}_best_val.pth")
best_val_accuracy = epoch_accuracy["val"]
# Update history
for split in ["train", "val", "test"]:
history_loss[split].append(epoch_loss[split])
history_accuracy[split].append(epoch_accuracy[split])
# Print info
print(f"Epoch {epoch+1}:",
f"TrL={epoch_loss['train']:.4f},",
f"TrA={epoch_accuracy['train']:.4f},",
f"VL={epoch_loss['val']:.4f},",
f"VA={epoch_accuracy['val']:.4f},",
f"TeL={epoch_loss['test']:.4f},",
f"TeA={epoch_accuracy['test']:.4f},")
except KeyboardInterrupt:
print("Interrupted")
finally:
# Plot loss
plt.title("Loss")
for split in ["train", "val", "test"]:
plt.plot(history_loss[split], label=split)
plt.legend()
plt.show()
# Plot accuracy
plt.title("Accuracy")
for split in ["train", "val", "test"]:
plt.plot(history_accuracy[split], label=split)
plt.legend()
plt.show()
But I have still the same error
Yes if you have 2 inputs of data points , then pass 2 arguments here
pred = net(input1,input2) #input1 ---> mnist ,input2 ---> cifar
Related
I have Alexnet neural network:
class AlexNet(nn.Module):
def __init__(self, num_classes=100):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(64, 192, kernel_size=5, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(192, 384, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(384, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
)
self.classifier = nn.Sequential(
nn.Dropout(),
nn.Linear(256 * 6 * 6, 4096),
nn.ReLU(inplace=True),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), 256 * 6 * 6)
x = self.classifier(x)
return x
I am trying to get the information of the intermediate layers (for example the penultimate layer ) with backward hook but I couldn't get it
According to this answer
You have to split your model in different parts and create methods to access them parts such as :
class AlexNet(nn.Module):
def __init__(self, num_classes=100):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(64, 192, kernel_size=5, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(192, 384, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(384, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
)
self.classifier = nn.Sequential(
nn.Dropout(),
nn.Linear(256 * 6 * 6, 4096),
nn.ReLU(inplace=True),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Linear(4096, num_classes),
)
def getFeatures(self,x):
x = self.features(x)
return x.view(x.size(0), 256 * 6 * 6)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), 256 * 6 * 6)
x = self.classifier(x)
return x
This way is quite common and you can find plenty of examples.
I am working on SVHN dataset, and I got this error, only during training phase. During the instantiation of the model, it works.
RuntimeError: Given groups=1, weight of size [64, 3, 3, 3], expected input[64, 1, 32, 32] to have 3 channels, but got 1 channels instead
To be sure of having 3 channels, I wrote that Grayscale(3) transformation.
# Compose transformations
data_transform = transforms.Compose([
transforms.Resize((32,32)),
transforms.Grayscale(num_output_channels=3),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
])
# Compose transformations
test_transform = transforms.Compose([
transforms.Resize((32,32)),
transforms.Grayscale(num_output_channels=3),
transforms.ToTensor(),
])
# Load MNIST dataset with transforms
train_set_svhn = torchvision.datasets.SVHN(root=base_dir, split='train', download=True, transform=data_transform, target_transform=None)
test_set_svhn = torchvision.datasets.SVHN(root=base_dir, split='test', download=True, transform=test_transform)
class VGG16(nn.Module):
def __init__(self, num_classes):
super(VGG16, self).__init__()
# calculate same padding:
# (w - k + 2*p)/s + 1 = o
# => p = (s(o-1) - w + k)/2
self.block_1 = nn.Sequential(
nn.Conv2d(in_channels=3,
out_channels=64,
kernel_size=(3, 3),
stride=(1, 1),
# (1(32-1)- 32 + 3)/2 = 1
padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(in_channels=64,
out_channels=64,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.block_2 = nn.Sequential(
nn.Conv2d(in_channels=64,
out_channels=128,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.Conv2d(in_channels=128,
out_channels=128,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.block_3 = nn.Sequential(
nn.Conv2d(in_channels=128,
out_channels=256,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.Conv2d(in_channels=256,
out_channels=256,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.Conv2d(in_channels=256,
out_channels=256,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.classifier = nn.Sequential(
nn.Linear(4096, 4096),
nn.ReLU(True),
nn.Dropout(p=0.25),
nn.Linear(4096, 4096),
nn.ReLU(True),
nn.Dropout(p=0.25),
nn.Linear(4096, num_classes),
)
for m in self.modules():
if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Linear):
nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
# nn.init.xavier_normal_(m.weight)
if m.bias is not None:
m.bias.detach().zero_()
# self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
def forward(self, x):
x = self.block_1(x)
x = self.block_2(x)
x = self.block_3(x)
# x = self.avgpool(x)
x = x.view(x.size(0), -1)
logits = self.classifier(x)
probas = F.softmax(logits, dim=1)
# probas = nn.Softmax(logits)
return probas
# return logits
I have no idea where that 1 comes from
Moreover, this is the shape of the output of the model without classifier (fully connected layers):
output = model1(test_x)
output.shape
torch.Size([1, 256, 4, 4])
And indeed I will pass 256x4x4 as input to the first FC.
Edit:
I encountered similar problems, but I did not use the MNIST dataset. I confirmed that I used the input file of (256256,3), and the following is my script file.
RuntimeError: Given groups=1, weight of size [64, 3, 4, 4], expected input[2, 2, 64, 64] to have 3 channels, but got 2 channels instead[enter image description here][1]
I am trying to do binary classification on MNIST dataset. Class 0 for even numbers and class 1 for odd numbers. I am using a simplified version of VGG.
My NN has a loss and an accuracy that remain costant.
I want to say that my model, reached to over 90% of accuracy before of changing targets into binary targets, so probably there is something wrong.
Here I change the targets into binary:
for i in range(10):
idx = (train_set.targets==i)
if (i == 0) or ((i % 2) == 0): train_set.targets[idx] = 0
else: train_set.targets[idx] = 1
for i in range(10):
idx = (test_set.targets==i)
if (i == 0) or ((i % 2) == 0): test_set.targets[idx] = 0
else: test_set.targets[idx] = 1
This is my net:
class VGG16(nn.Module):
def __init__(self, num_classes):
super(VGG16, self).__init__()
# calculate same padding:
# (w - k + 2*p)/s + 1 = o
# => p = (s(o-1) - w + k)/2
self.block_1 = nn.Sequential(
nn.Conv2d(in_channels=1,
out_channels=64,
kernel_size=(3, 3),
stride=(1, 1),
# (1(32-1)- 32 + 3)/2 = 1
padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(in_channels=64,
out_channels=64,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.block_2 = nn.Sequential(
nn.Conv2d(in_channels=64,
out_channels=128,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.Conv2d(in_channels=128,
out_channels=128,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.block_3 = nn.Sequential(
nn.Conv2d(in_channels=128,
out_channels=256,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.Conv2d(in_channels=256,
out_channels=256,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.Conv2d(in_channels=256,
out_channels=256,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.block_4 = nn.Sequential(
nn.Conv2d(in_channels=256,
out_channels=512,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(512),
nn.ReLU(),
nn.Conv2d(in_channels=512,
out_channels=512,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(512),
nn.ReLU(),
nn.Conv2d(in_channels=512,
out_channels=512,
kernel_size=(3, 3),
stride=(1, 1),
padding=1),
nn.BatchNorm2d(512),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2, 2),
stride=(2, 2))
)
self.classifier = nn.Sequential(
nn.Linear(2048, 4096),
nn.ReLU(True),
nn.Dropout(p=0.65),
nn.Linear(4096, 4096),
nn.ReLU(True),
nn.Dropout(p=0.65),
nn.Linear(4096, num_classes),
nn.Sigmoid()
)
for m in self.modules():
if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Linear):
nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
# nn.init.xavier_normal_(m.weight)
if m.bias is not None:
m.bias.detach().zero_()
# self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
def forward(self, x):
x = self.block_1(x)
x = self.block_2(x)
x = self.block_3(x)
x = self.block_4(x)
# x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
return x
#logits = self.classifier(x)
#probas = F.softmax(logits, dim=1)
# probas = nn.Softmax(logits)
#return probas
# return logits
# Define an optimizier
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr = 0.01)
# Define a loss
criterion = nn.BCELoss()
def train(net, loaders, optimizer, criterion, epochs=20, dev=dev, save_param = False, model_name="valerio"):
try:
net = net.to(dev)
#print(net)
# Initialize history
history_loss = {"train": [], "val": [], "test": []}
history_accuracy = {"train": [], "val": [], "test": []}
# Store the best val accuracy
best_val_accuracy = 0
# Process each epoch
for epoch in range(epochs):
# Initialize epoch variables
sum_loss = {"train": 0, "val": 0, "test": 0}
sum_accuracy = {"train": 0, "val": 0, "test": 0}
# Process each split
for split in ["train", "val", "test"]:
if split == "train":
net.train()
else:
net.eval()
# Process each batch
for (input, labels) in loaders[split]:
# Move to CUDA
input = input.to(dev)
labels = labels.to(dev)
# Reset gradients
optimizer.zero_grad()
# Compute output
pred = net(input)
labels = labels.unsqueeze(1)
labels = labels.float()
loss = criterion(pred, labels)
# Update loss
sum_loss[split] += loss.item()
# Check parameter update
if split == "train":
# Compute gradients
loss.backward()
# Optimize
optimizer.step()
# Compute accuracy
_,pred_labels = pred.max(1)
batch_accuracy = (pred_labels == labels).sum().item()/input.size(0)
# Update accuracy
sum_accuracy[split] += batch_accuracy
# Compute epoch loss/accuracy
epoch_loss = {split: sum_loss[split]/len(loaders[split]) for split in ["train", "val", "test"]}
epoch_accuracy = {split: sum_accuracy[split]/len(loaders[split]) for split in ["train", "val", "test"]}
# Store params at the best validation accuracy
if save_param and epoch_accuracy["val"] > best_val_accuracy:
#torch.save(net.state_dict(), f"{net.__class__.__name__}_best_val.pth")
torch.save(net.state_dict(), f"{model_name}_best_val.pth")
best_val_accuracy = epoch_accuracy["val"]
# Update history
for split in ["train", "val", "test"]:
history_loss[split].append(epoch_loss[split])
history_accuracy[split].append(epoch_accuracy[split])
# Print info
print(f"Epoch {epoch+1}:",
f"TrL={epoch_loss['train']:.4f},",
f"TrA={epoch_accuracy['train']:.4f},",
f"VL={epoch_loss['val']:.4f},",
f"VA={epoch_accuracy['val']:.4f},",
f"TeL={epoch_loss['test']:.4f},",
f"TeA={epoch_accuracy['test']:.4f},")
except KeyboardInterrupt:
print("Interrupted")
finally:
# Plot loss
plt.title("Loss")
for split in ["train", "val", "test"]:
plt.plot(history_loss[split], label=split)
plt.legend()
plt.show()
# Plot accuracy
plt.title("Accuracy")
for split in ["train", "val", "test"]:
plt.plot(history_accuracy[split], label=split)
plt.legend()
plt.show()
From the previous model of digit recognition i changed only the targets, and the final layer of classifier from 10 classes to 1 class + Sigmoid. And i changed also cross entropy to BCELoss. What I am doing wrong?
These are loss and accuracy values:
Epoch 1: TrL=49.0955, TrA=31.4211, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 2: TrL=49.0992, TrA=31.4235, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 3: TrL=49.0899, TrA=31.4176, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 4: TrL=49.0936, TrA=31.4199, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 5: TrL=49.0936, TrA=31.4199, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 6: TrL=49.0825, TrA=31.4128, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
What's wrong? How is it possible that with 10 classes I reached over 90% accuracy, and with a simplified version, only 2 classes, I reach 30% of accuracy?
Edit: increasing batch size from 64 to 128, accuracy reaches to 60% and remains constant...
In my opinion, the problem is different representations of odd and even numbers. Let's take 1, 3 pictures with this number are sundry, and convolution neural networks have a problem with extract features. The neural network has 90% accuracy, with 10 classes, so why you need to convert this into 2. If you know that number is 1, 3, 5, 7, 9, you know that it's odd.
I am training a neural network to learn to calculate moving averages. The input is 5 day values and the output is moving average of the 5 days. But the NN is unable to learn it. It is giving a constant value for all predictions. The code is given below.
X_train = []
y_train = []
n = 2000
for i in range(5, n):
X_train.append(df.iloc[i-5:i, 0])
y_train.append(df.iloc[i-1, 2])
X_train, y_train = np.array(X_train), np.array(y_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
regressor = Sequential()
regressor.add(LSTM(units = 50, return_sequences = True, input_shape = (X_train.shape[1], 1)))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units = 50))
regressor.add(Dropout(0.2))
regressor.add(Dense(units = 1))
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')
regressor.fit(X_train, y_train, epochs = 250, batch_size = 32)
What am I missing?
I would like to know the difference between batch normalization and self normalized neural network. In other words, would SELU (Scaled Exponential Linear Unit) replace batch normalization and how?
Moreover, I after looking into the values of the SELU activations, they were in the range: [-1, 1]. While this is not the case with batch normalization. Instead, the values after the BN layer (before the relu activation), took the values of [-a, a] Approximately, and not [-1, 1].
Here is how I printed the values after the SELU activation and after batch norm layer:
batch_norm_layer = tf.Print(batch_norm_layer,
data=[tf.reduce_max(batch_norm_layer), tf.reduce_min(batch_norm_layer)],
message = name_scope + ' min and max')
And similar code for the SELU activations...
Batch norm layer is defined as follows:
def batch_norm(x, n_out, phase_train, in_conv_layer = True):
with tf.variable_scope('bn'):
beta = tf.Variable(tf.constant(0.0, shape=n_out),
name='beta', trainable=True)
gamma = tf.Variable(tf.constant(1.0, shape=n_out),
name='gamma', trainable=True)
if in_conv_layer:
batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments')
else:
batch_mean, batch_var = tf.nn.moments(x, [0, 1], name='moments')
ema = tf.train.ExponentialMovingAverage(decay=0.9999)
def mean_var_with_update():
ema_apply_op = ema.apply([batch_mean, batch_var])
with tf.control_dependencies([ema_apply_op]):
return tf.identity(batch_mean), tf.identity(batch_var)
mean, var = tf.cond(phase_train,
mean_var_with_update,
lambda: (ema.average(batch_mean), ema.average(batch_var)))
normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
return normed
Therefore, since batch norm outputs higher values, the loss increases dramatically, and thus I got nans.
In addition, I tried reducing the learning rate with batch norm, but, that didn't help as well. So how to fix this problem???
Here is the following code:
import tensorflow as tf
import numpy as np
import os
import cv2
batch_size = 32
num_epoch = 102
latent_dim = 100
def weight_variable(kernal_shape):
weights = tf.get_variable(name='weights', shape=kernal_shape, dtype=tf.float32, trainable=True,
initializer=tf.truncated_normal_initializer(stddev=0.02))
return weights
def bias_variable(shape):
initial = tf.constant(0.0, shape=shape)
return tf.Variable(initial)
def batch_norm(x, n_out, phase_train, convolutional = True):
with tf.variable_scope('bn'):
exp_moving_avg = tf.train.ExponentialMovingAverage(decay=0.9999)
beta = tf.Variable(tf.constant(0.0, shape=n_out),
name='beta', trainable=True)
gamma = tf.Variable(tf.constant(1.0, shape=n_out),
name='gamma', trainable=True)
if convolutional:
batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments')
else:
batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')
update_moving_averages = exp_moving_avg.apply([batch_mean, batch_var])
m = tf.cond(phase_train, lambda: exp_moving_avg.average(batch_mean), lambda: batch_mean)
v = tf.cond(phase_train, lambda: exp_moving_avg.average(batch_var), lambda: batch_var)
normed = tf.nn.batch_normalization(x, m, v, beta, gamma, 1e-3)
normed = tf.Print(normed, data=[tf.shape(normed)], message='size of normed?')
return normed, update_moving_averages # Note that we should run the update_moving_averages with sess.run...
def conv_layer(x, w_shape, b_shape, padding='SAME'):
W = weight_variable(w_shape)
tf.summary.histogram("weights", W)
b = bias_variable(b_shape)
tf.summary.histogram("biases", b)
# Note that I used a stride of 2 on purpose in order not to use max pool layer.
conv = tf.nn.conv2d(x, W, strides=[1, 2, 2, 1], padding=padding) + b
conv_batch_norm, update_moving_averages = batch_norm(conv, b_shape, phase_train=tf.cast(True, tf.bool))
name_scope = tf.get_variable_scope().name
conv_batch_norm = tf.Print(conv_batch_norm,
data=[tf.reduce_max(conv_batch_norm), tf.reduce_min(conv_batch_norm)],
message = name_scope + ' min and max')
activations = tf.nn.relu(conv_batch_norm)
tf.summary.histogram("activations", activations)
return activations, update_moving_averages
def deconv_layer(x, w_shape, b_shape, padding="SAME", activation='selu'):
W = weight_variable(w_shape)
tf.summary.histogram("weights", W)
b = bias_variable(b_shape)
tf.summary.histogram('biases', b)
x_shape = tf.shape(x)
out_shape = tf.stack([x_shape[0], x_shape[1] * 2, x_shape[2] * 2, w_shape[2]])
if activation == 'selu':
conv_trans = tf.nn.conv2d_transpose(x, W, out_shape, [1, 2, 2, 1], padding=padding) + b
conv_trans_batch_norm, update_moving_averages = \
batch_norm(conv_trans, b_shape, phase_train=tf.cast(True, tf.bool))
transposed_activations = tf.nn.relu(conv_trans_batch_norm)
else:
conv_trans = tf.nn.conv2d_transpose(x, W, out_shape, [1, 2, 2, 1], padding=padding) + b
conv_trans_batch_norm, update_moving_averages = \
batch_norm(conv_trans, b_shape, phase_train=tf.cast(True, tf.bool))
transposed_activations = tf.nn.sigmoid(conv_trans_batch_norm)
tf.summary.histogram("transpose_activation", transposed_activations)
return transposed_activations, update_moving_averages
tfrecords_filename_seq = ["C:/Users/user/PycharmProjects/AffectiveComputing/P16_db.tfrecords"]
filename_queue = tf.train.string_input_producer(tfrecords_filename_seq, num_epochs=num_epoch, shuffle=False, name='queue')
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
# Defaults are not specified since both keys are required.
features={
'height': tf.FixedLenFeature([], tf.int64),
'width': tf.FixedLenFeature([], tf.int64),
'image_raw': tf.FixedLenFeature([], tf.string),
'annotation_raw': tf.FixedLenFeature([], tf.string)
})
# This is how we create one example, that is, extract one example from the database.
image = tf.decode_raw(features['image_raw'], tf.uint8)
# The height and the weights are used to
height = tf.cast(features['height'], tf.int32)
width = tf.cast(features['width'], tf.int32)
# The image is reshaped since when stored as a binary format, it is flattened. Therefore, we need the
# height and the weight to restore the original image back.
image = tf.reshape(image, [height, width, 3])
annotation = tf.cast(features['annotation_raw'], tf.string)
min_after_dequeue = 100
num_threads = 1
capacity = min_after_dequeue + num_threads * batch_size
label_batch, images_batch = tf.train.batch([annotation, image],
shapes=[[], [112, 112, 3]],
batch_size=batch_size,
capacity=capacity,
num_threads=num_threads)
label_batch_splitted = tf.string_split(label_batch, delimiter=',')
label_batch_values = tf.reshape(label_batch_splitted.values, [batch_size, -1])
label_batch_numbers = tf.string_to_number(label_batch_values, out_type=tf.float32)
confidences = tf.slice(label_batch_numbers, begin=[0, 2], size=[-1, 1])
images_batch = tf.cast([images_batch], tf.float32)[0] # Note that casting the image will increases its rank.
with tf.name_scope('image_normal'):
images_batch = tf.map_fn(lambda img: tf.image.per_image_standardization(img), images_batch)
#images_batch = tf.Print(images_batch, data=[tf.reduce_max(images_batch), tf.reduce_min(images_batch)],
# message='min and max in images_batch')
with tf.variable_scope('conv1'):
conv1, uma_conv1 = conv_layer(images_batch, [4, 4, 3, 32], [32]) # image size: [56, 56]
with tf.variable_scope('conv2'):
conv2, uma_conv2 = conv_layer(conv1, [4, 4, 32, 64], [64]) # image size: [28, 28]
with tf.variable_scope('conv3'):
conv3, uma_conv3 = conv_layer(conv2, [4, 4, 64, 128], [128]) # image size: [14, 14]
with tf.variable_scope('conv4'):
conv4, uma_conv4 = conv_layer(conv3, [4, 4, 128, 256], [256]) # image size: [7, 7]
conv4_reshaped = tf.reshape(conv4, [-1, 7 * 7 * 256], name='conv4_reshaped')
w_c_mu = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_mu')
b_c_mu = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_mu')
w_c_sig = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_sig')
b_c_sig = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_sig')
epsilon = tf.random_normal([1, latent_dim])
tf.summary.histogram('weights_c_mu', w_c_mu)
tf.summary.histogram('biases_c_mu', b_c_mu)
tf.summary.histogram('weights_c_sig', w_c_sig)
tf.summary.histogram('biases_c_sig', b_c_sig)
with tf.variable_scope('mu'):
mu = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_mu), b_c_mu)
tf.summary.histogram('mu', mu)
with tf.variable_scope('stddev'):
stddev = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_sig), b_c_sig)
tf.summary.histogram('stddev', stddev)
with tf.variable_scope('z'):
latent_var = mu + tf.multiply(tf.sqrt(tf.exp(stddev)), epsilon)
tf.summary.histogram('features_sig', stddev)
w_dc = tf.Variable(tf.truncated_normal([latent_dim, 7 * 7 * 256], stddev=0.1), name='weights_dc')
b_dc = tf.Variable(tf.constant(0.0, shape=[7 * 7 * 256]), name='biases_dc')
tf.summary.histogram('weights_dc', w_dc)
tf.summary.histogram('biases_dc', b_dc)
with tf.variable_scope('deconv4'):
deconv4 = tf.nn.bias_add(tf.matmul(latent_var, w_dc), b_dc)
deconv4_batch_norm, uma_deconv4 = \
batch_norm(deconv4, [7 * 7 * 256], phase_train=tf.cast(True, tf.bool), convolutional=False)
deconv4 = tf.nn.relu(deconv4_batch_norm)
deconv4_reshaped = tf.reshape(deconv4, [-1, 7, 7, 256], name='deconv4_reshaped')
with tf.variable_scope('deconv3'):
deconv3, uma_deconv3 = deconv_layer(deconv4_reshaped, [3, 3, 128, 256], [128], activation='selu')
with tf.variable_scope('deconv2'):
deconv2, uma_deconv2 = deconv_layer(deconv3, [3, 3, 64, 128], [64], activation='selu')
with tf.variable_scope('deconv1'):
deconv1, uma_deconv1 = deconv_layer(deconv2, [3, 3, 32, 64], [32], activation='selu')
with tf.variable_scope('deconv_image'):
deconv_image_batch, uma_deconv = deconv_layer(deconv1, [3, 3, 3, 32], [3], activation='sigmoid')
# loss function.
with tf.name_scope('loss_likelihood'):
# temp1 shape: [32, 112, 112, 3]
temp1 = images_batch * tf.log(deconv_image_batch + 1e-9) + (1 - images_batch) * tf.log(1 - deconv_image_batch + 1e-9)
#temp1 = temp1 * confidences. This will give an error. Therefore, we should expand the dimension of confidence tensor
confidences_ = tf.expand_dims(tf.expand_dims(confidences, axis=1), axis=1) # shape: [32, 1, 1, 1].
temp1 = temp1 * confidences_
log_likelihood = -tf.reduce_sum(temp1, reduction_indices=[1, 2, 3])
log_likelihood_total = tf.reduce_sum(log_likelihood)
#l2_loss = tf.reduce_mean(tf.abs(tf.subtract(images_batch, deconv_image_batch)))
with tf.name_scope('loss_KL'):
# temp2 shape: [32, 200]
temp2 = 1 + tf.log(tf.square(stddev + 1e-9)) - tf.square(mu) - tf.square(stddev)
temp3 = temp2 * confidences # confidences shape is [32, 1]
KL_term = - 0.5 * tf.reduce_sum(temp3, reduction_indices=1)
KL_term_total = tf.reduce_sum(KL_term)
with tf.name_scope('total_loss'):
variational_lower_bound = tf.reduce_mean(log_likelihood + KL_term)
tf.summary.scalar('loss', variational_lower_bound)
with tf.name_scope('optimizer'):
optimizer = tf.train.AdamOptimizer(0.00001).minimize(variational_lower_bound)
init_op = tf.group(tf.local_variables_initializer(),
tf.global_variables_initializer())
saver = tf.train.Saver()
model_path = 'C:/Users/user/PycharmProjects/VariationalAutoEncoder/' \
'VariationalAutoEncoderFaces/tensorboard_logs/Graph_model/ckpt'
# Here is the session...
with tf.Session() as sess:
train_writer = tf.summary.FileWriter('C:/Users/user/PycharmProjects/VariationalAutoEncoder/'
'VariationalAutoEncoderFaces/tensorboard_logs/Event_files', sess.graph)
merged = tf.summary.merge_all()
# Note that init_op should start before the Coordinator and the thread otherwise, this will throw an error.
sess.run(init_op)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
step = 0
to_run_list = [uma_conv1, uma_conv2, uma_conv3, uma_conv4, uma_deconv1, uma_deconv2, uma_deconv3,
uma_deconv4, uma_deconv, optimizer, variational_lower_bound, merged,
deconv_image_batch, image]
# Note that the last name "Graph_model" is the name of the saved checkpoints file => the ckpt is saved
# under tensorboard_logs.
ckpt = tf.train.get_checkpoint_state(
os.path.dirname(model_path))
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
print('checkpoints are saved!!!')
else:
print('No stored checkpoints')
epoch = 0
while not coord.should_stop():
_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, loss, summary, reconstructed_image, original_image = \
sess.run(to_run_list)
print('total loss:', loss)
original_image = cv2.cvtColor(np.array(original_image), cv2.COLOR_RGB2BGR)
reconstructed_image = cv2.cvtColor(np.array(reconstructed_image[0]), cv2.COLOR_RGB2BGR)
cv2.imshow('original_image', original_image)
cv2.imshow('reconstructed_image', reconstructed_image)
cv2.waitKey(1)
if step % 234 == 0:
epoch += 1
print('epoch:', epoch)
if epoch == num_epoch - 2:
coord.request_stop()
if step % 100 == 0:
train_writer.add_summary(summary, step)
#print('total loss:', loss)
#print('log_likelihood_', log_likelihood_)
#print('KL_term', KL_term_)
step += 1
save_path = saver.save(sess, model_path)
coord.request_stop()
coord.join(threads)
train_writer.close()
Any help is much appreciated!!
Here are some sample codes to show the trend of means and variances over 3 SELU layers. The numbers of nodes on the layers (including the input layer) are [15, 30, 30, 8]
import tensorflow as tf
import numpy as np
import os
#-----------------------------------------------#
# https://github.com/bioinf-jku/SNNs/blob/master/selu.py
# The SELU activation function
def selu(x):
with ops.name_scope('elu') as scope:
alpha = 1.6732632423543772848170429916717
scale = 1.0507009873554804934193349852946
return scale*tf.where(x>=0.0, x, alpha*tf.nn.elu(x))
#-----------------------------------------------#
# https://github.com/bioinf-jku/SNNs/blob/master/selu.py
# alpha-dropout
def dropout_selu(x, rate, alpha= -1.7580993408473766, fixedPointMean=0.0, fixedPointVar=1.0,
noise_shape=None, seed=None, name=None, training=False):
"""Dropout to a value with rescaling."""
def dropout_selu_impl(x, rate, alpha, noise_shape, seed, name):
keep_prob = 1.0 - rate
x = ops.convert_to_tensor(x, name="x")
if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
raise ValueError("keep_prob must be a scalar tensor or a float in the "
"range (0, 1], got %g" % keep_prob)
keep_prob = ops.convert_to_tensor(keep_prob, dtype=x.dtype, name="keep_prob")
keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())
alpha = ops.convert_to_tensor(alpha, dtype=x.dtype, name="alpha")
alpha.get_shape().assert_is_compatible_with(tensor_shape.scalar())
if tensor_util.constant_value(keep_prob) == 1:
return x
noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x)
random_tensor = keep_prob
random_tensor += random_ops.random_uniform(noise_shape, seed=seed, dtype=x.dtype)
binary_tensor = math_ops.floor(random_tensor)
ret = x * binary_tensor + alpha * (1-binary_tensor)
a = math_ops.sqrt(fixedPointVar / (keep_prob *((1-keep_prob) * math_ops.pow(alpha-fixedPointMean,2) + fixedPointVar)))
b = fixedPointMean - a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha)
ret = a * ret + b
ret.set_shape(x.get_shape())
return ret
with ops.name_scope(name, "dropout", [x]) as name:
return utils.smart_cond(training,
lambda: dropout_selu_impl(x, rate, alpha, noise_shape, seed, name),
lambda: array_ops.identity(x))
#-----------------------------------------------#
# build a 3-layer dense network with SELU activation and alpha-dropout
sess = tf.InteractiveSession()
w1 = tf.constant(np.random.normal(loc=0.0, scale=np.sqrt(1.0/15.0), size = [15, 30]))
b1 = tf.constant(np.random.normal(loc=0.0, scale=0.5, size = [30]))
x1 = tf.constant(np.random.normal(loc=0.0, scale=1.0, size = [200, 15]))
y1 = tf.add(tf.matmul(x1, w1), b1)
y1_selu = selu(y1)
y1_selu_dropout = dropout_selu(y1_selu, 0.05, training=True)
w2 = tf.constant(np.random.normal(loc=0.0, scale=np.sqrt(1.0/30.0), size = [30, 30]))
b2 = tf.constant(np.random.normal(loc=0.0, scale=0.5, size = [30]))
x2 = y1_selu_dropout
y2 = tf.add(tf.matmul(x2, w2), b2)
y2_selu = selu(y2)
y2_selu_dropout = dropout_selu(y2_selu, 0.05, training=True)
w3 = tf.constant(np.random.normal(loc=0.0, scale=np.sqrt(1.0/30.0), size = [30, 8]))
b3 = tf.constant(np.random.normal(loc=0.0, scale=0.5, size = [8]))
x3 = y2_selu_dropout
y3 = tf.add(tf.matmul(x3, w3), b3)
y3_selu = selu(y3)
y3_selu_dropout = dropout_selu(y3_selu, 0.05, training=True)
#-------------------------#
# evaluate the network
x1_v, y1_selu_dropout_v, \
x2_v, y2_selu_dropout_v, \
x3_v, y3_selu_dropout_v, \
= sess.run([x1, y1_selu_dropout, x2, y2_selu_dropout, x3, y3_selu_dropout])
#-------------------------#
# print each layer's mean and standard deviation (1st line: input; 2nd line: output)
print("Layer 1")
print(np.mean(x1_v), np.std(x1_v))
print(np.mean(y1_selu_dropout_v), np.std(y1_selu_dropout_v))
print("Layer 2")
print(np.mean(x2_v), np.std(x2_v))
print(np.mean(y2_selu_dropout_v), np.std(y2_selu_dropout_v))
print("Layer 3")
print(np.mean(x3_v), np.std(x3_v))
print(np.mean(y3_selu_dropout_v), np.std(y3_selu_dropout_v))
Here is one possible output. Over 3 layers, the mean and standard deviation are still close to 0 and 1, respectively.
Layer 1
-0.0101213033749 1.01375071842
0.0106228883975 1.09375593322
Layer 2
0.0106228883975 1.09375593322
-0.027910206754 1.12216643393
Layer 3
-0.027910206754 1.12216643393
-0.131790078631 1.09698413493