Regularisers in Keras vs. Caffe

I've adapted an AlexNet in Keras, trying to learn a single class (i.e. dog/face or not dog/face).
I have training samples of correct images, for which Ytrain is [1, 0], and of incorrect images, for which Ytrain is [0, 1].
The goal is to know whether the image is a dog (for example).
While training, the loss was very big, ~100,000,000, so I deleted these regulariser lines and it worked:
activity_regularizer=ActivityRegularizer(l1=1, l2=1),
W_regularizer=WeightRegularizer(l1=2.0, l2=0.0)))
In Caffe the model works fine with these lines in the convolution layers:
param {
  lr_mult: 1
  decay_mult: 1
}
param {
  lr_mult: 2
  decay_mult: 0
}
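For context, these Caffe param blocks are not regularisers in themselves: lr_mult and decay_mult are per-parameter multipliers of the solver's global learning rate and weight decay (the first block applies to the weights, the second to the bias, where decay_mult: 0 turns decay off). With a typical solver weight_decay of 5e-4, the effective L2 penalty on the weights is tiny, whereas l1=1, l2=1 in the Keras code applies the penalties at full strength to both weights and activations. A closer Keras 1.x equivalent would drop the activity regulariser entirely and use a small weight penalty; a sketch, assuming a global weight decay of 5e-4:

# Rough Caffe equivalent: small L2 on the weights only, no activity penalty
model.add(Convolution2D(96, 11, 11, border_mode='valid',
                        init='glorot_normal',
                        activation='relu',
                        W_regularizer=WeightRegularizer(l1=0.0, l2=5e-4)))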
Here is the model in Keras (image size 3, 68, 56):
Xtrain = self.Xtrain
Ytrain = self.Ytrain
batch_size = 10
nb_classes = 2
nb_epoch = 3
# input image dimensions
img_rows, img_cols = np.shape(self.Xtrain)[2], np.shape(self.Xtrain)[3]
#########################
###  AlexNet - Start  ###
#########################
print("Defining AlexNet...")
model = Sequential()
model.add(ZeroPadding2D((1, 1), input_shape=(3, img_rows, img_cols)))
model.add(Convolution2D(96, 11, 11, border_mode='valid',
                        init='glorot_normal',
                        activation='relu',
                        activity_regularizer=ActivityRegularizer(l1=1, l2=1),
                        W_regularizer=WeightRegularizer(l1=2.0, l2=0.0)))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
model.add(ZeroPadding2D((1, 1)))
model.add(Convolution2D(256, 5, 5, border_mode='valid',
                        init='glorot_normal',
                        activation='relu',
                        activity_regularizer=ActivityRegularizer(l1=1, l2=1),
                        W_regularizer=WeightRegularizer(l1=2, l2=0)))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
model.add(ZeroPadding2D((1, 1)))
model.add(Convolution2D(384, 3, 3, border_mode='valid',
                        init='glorot_normal',
                        activation='relu',
                        activity_regularizer=ActivityRegularizer(l1=1, l2=1),
                        W_regularizer=WeightRegularizer(l1=2, l2=0)))
model.add(ZeroPadding2D((1, 1)))
model.add(Convolution2D(384, 3, 3, border_mode='valid',
                        init='glorot_normal',
                        activation='relu',
                        activity_regularizer=ActivityRegularizer(l1=1, l2=1),
                        W_regularizer=WeightRegularizer(l1=2, l2=0)))
model.add(ZeroPadding2D((1, 1)))
model.add(Convolution2D(256, 3, 3, border_mode='valid',
                        init='glorot_normal',
                        activation='relu',
                        activity_regularizer=ActivityRegularizer(l1=1, l2=1),
                        W_regularizer=WeightRegularizer(l1=2, l2=0)))
model.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
model.add(Flatten())
model.add(Dense(4096,
                activation='relu',
                init='glorot_normal',
                activity_regularizer=ActivityRegularizer(l1=1, l2=1),
                W_regularizer=WeightRegularizer(l1=2, l2=0)))
model.add(Dropout(0.5))
model.add(Dense(4096,
                activation='relu',
                init='glorot_normal',
                activity_regularizer=ActivityRegularizer(l1=1, l2=1),
                W_regularizer=WeightRegularizer(l1=2, l2=0)))
model.add(Dropout(0.5))
model.add(Dense(nb_classes,
                activation='softmax',
                activity_regularizer=ActivityRegularizer(l1=1, l2=1),
                W_regularizer=WeightRegularizer(l1=2, l2=0)))
#######################
###  AlexNet - End  ###
#######################
add = Adadelta(lr=0.1, rho=0.95, epsilon=1e-06, decay=0.995)
print("Compiling AlexNet...")
model.compile(loss='categorical_crossentropy', optimizer=add)
print("Fitting AlexNet...")
model.fit(Xtrain, Ytrain, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, verbose=1, shuffle=True)
Why is that??
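A back-of-the-envelope estimate (mine, not from the question) shows why: with l1=1, the activity regulariser adds the raw sum of absolute activations of each regularised layer to the loss. For the first conv layer alone:

# Rough estimate for a 68x56 input: after 1 pixel of zero padding, an 11x11
# valid convolution leaves 60x48 positions per feature map; with 96 maps and
# a mean ReLU activation of, say, 0.5 per unit:
l1_activity_penalty = 96 * 60 * 48 * 0.5   # ~138,000 per sample, for one layer

Summed over every regularised layer, plus the l2=1 activity term and the l1=2 weight term, this dwarfs the cross-entropy term and can easily reach the reported ~100,000,000.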

Related

TypeError: forward() missing 1 required positional argument: 'c'

I created this simplified version of VGG16:
class VGG16COMBO(nn.Module):
    def __init__(self, num_classes):
        super(VGG16COMBO, self).__init__()
        # calculate same padding:
        # (w - k + 2*p)/s + 1 = o
        # => p = (s(o-1) - w + k)/2
        self.block_1 = nn.Sequential(
            # (1(32-1) - 32 + 3)/2 = 1
            nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.block_2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.block_3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.block_4 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.classifier = nn.Sequential(
            nn.Linear(2048, 4096),
            nn.ReLU(True),
            nn.Dropout(p=0.25),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(p=0.25),
            nn.Linear(4096, num_classes),
        )

    def forward(self, m, c):
        m = self.block_1(m)
        m = self.block_2(m)
        m = self.block_3(m)
        m = self.block_4(m)
        m = m.view(m.size(0), -1)
        m = self.classifier(m)
        c = self.block_1(c)
        c = self.block_2(c)
        c = self.block_3(c)
        c = self.block_4(c)
        c = c.view(c.size(0), -1)
        c = self.classifier(c)
        x = torch.cat((m, c), dim=1)
        return x
You can see that in forward I pass 2 elements, m and c: m refers to MNIST and c to CIFAR10, because I want a multi-input neural network (a network with shared weights).
Then:
modelcombo = VGG16COMBO(1).cuda()
print(modelcombo)
# Define an optimizer
import torch.optim as optim
optimizer = optim.SGD(modelcombo.parameters(), lr = 0.01)
# Define a loss
criterion = nn.BCEWithLogitsLoss()
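One thing worth noting here (my observation, not from the post): forward returns torch.cat((m, c), dim=1), so with VGG16COMBO(1) the output has shape [batch, 2], and BCEWithLogitsLoss requires a float target of exactly that shape. A hypothetical shape check, with m_batch and c_batch standing in for one batch from each dataset:

out = modelcombo(m_batch, c_batch)       # shape: [batch, 2]
loss = criterion(out, targets.float())   # targets must also have shape [batch, 2]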
This is my training function:
# train: to be modified to use both datasets
def train(net, loaders, optimizer, criterion, epochs=20, dev=dev, save_param=False, model_name="valerio"):
    try:
        net = net.to(dev)
        #print(net)
        # Initialize history
        history_loss = {"train": [], "val": [], "test": []}
        history_accuracy = {"train": [], "val": [], "test": []}
        # Store the best val accuracy
        best_val_accuracy = 0
        # Process each epoch
        for epoch in range(epochs):
            # Initialize epoch variables
            sum_loss = {"train": 0, "val": 0, "test": 0}
            sum_accuracy = {"train": 0, "val": 0, "test": 0}
            # Process each split
            for split in ["train", "val", "test"]:
                if split == "train":
                    net.train()
                else:
                    net.eval()
                # Process each batch
                for (input, labels) in loaders[split]:
                    # Move to CUDA
                    input = input.to(dev)
                    labels = labels.to(dev)
                    # Reset gradients
                    optimizer.zero_grad()
                    # Compute output
                    pred = net(input)
                    #pred = pred.squeeze(dim=1) # Output shape is [Batch size, 1], but we want [Batch size]
                    labels = labels.unsqueeze(1)
                    labels = labels.float()
                    loss = criterion(pred, labels)
                    # Update loss
                    sum_loss[split] += loss.item()
                    # Check parameter update
                    if split == "train":
                        # Compute gradients
                        loss.backward()
                        # Optimize
                        optimizer.step()
                    # Compute accuracy
                    #pred_labels = pred.argmax(1) + 1
                    pred_labels = (pred >= 0.5).long()  # Binarize predictions to 0 and 1
                    batch_accuracy = (pred_labels == labels).sum().item() / input.size(0)
                    # Update accuracy
                    sum_accuracy[split] += batch_accuracy
            # Compute epoch loss/accuracy
            epoch_loss = {split: sum_loss[split] / len(loaders[split]) for split in ["train", "val", "test"]}
            epoch_accuracy = {split: sum_accuracy[split] / len(loaders[split]) for split in ["train", "val", "test"]}
            # Store params at the best validation accuracy
            if save_param and epoch_accuracy["val"] > best_val_accuracy:
                #torch.save(net.state_dict(), f"{net.__class__.__name__}_best_val.pth")
                torch.save(net.state_dict(), f"{model_name}_best_val.pth")
                best_val_accuracy = epoch_accuracy["val"]
            # Update history
            for split in ["train", "val", "test"]:
                history_loss[split].append(epoch_loss[split])
                history_accuracy[split].append(epoch_accuracy[split])
            # Print info
            print(f"Epoch {epoch+1}:",
                  f"TrL={epoch_loss['train']:.4f},",
                  f"TrA={epoch_accuracy['train']:.4f},",
                  f"VL={epoch_loss['val']:.4f},",
                  f"VA={epoch_accuracy['val']:.4f},",
                  f"TeL={epoch_loss['test']:.4f},",
                  f"TeA={epoch_accuracy['test']:.4f},")
    except KeyboardInterrupt:
        print("Interrupted")
    finally:
        # Plot loss
        plt.title("Loss")
        for split in ["train", "val", "test"]:
            plt.plot(history_loss[split], label=split)
        plt.legend()
        plt.show()
        # Plot accuracy
        plt.title("Accuracy")
        for split in ["train", "val", "test"]:
            plt.plot(history_accuracy[split], label=split)
        plt.legend()
        plt.show()
But when I do the training
# Train model
train(modelcombo, loaders, optimizer, criterion, epochs=10, dev=dev)
I obtain this error:
TypeError: forward() missing 1 required positional argument: 'c'
What do I have to change, the net or the training function? I think the problem is in the training function, because I have to pass both loaders and loaders_cifar, but I don't know how. In particular, do I have to concatenate the MNIST loaders and the CIFAR loaders before passing them to the training function, or do I have to modify for (input, labels) in loaders[split]: into something like for (input, labels) in loaders[split] and loaders_cifar[split]:?
EDIT: I created this function:
def itr_merge(*itrs):
    for itr in itrs:
        for v in itr:
            yield v
I edited the training function in this way:
# train: to be modified to use both datasets
def train2(net, loaders, loaders_cifar, optimizer, criterion, epochs=20, dev=dev, save_param=False, model_name="valerio"):
    try:
        net = net.to(dev)
        #print(net)
        # Initialize history
        history_loss = {"train": [], "val": [], "test": []}
        history_accuracy = {"train": [], "val": [], "test": []}
        # Store the best val accuracy
        best_val_accuracy = 0
        # Process each epoch
        for epoch in range(epochs):
            # Initialize epoch variables
            sum_loss = {"train": 0, "val": 0, "test": 0}
            sum_accuracy = {"train": 0, "val": 0, "test": 0}
            # Process each split
            for split in ["train", "val", "test"]:
                if split == "train":
                    net.train()
                else:
                    net.eval()
                # Process each batch
                # NOTE: x is never used below, and the inner loop still iterates
                # over loaders[split] alone, so net is still called with one argument
                for x in itr_merge(loaders[split], loaders_cifar[split]):
                    for (input, labels) in loaders[split]:
                        # Move to CUDA
                        input = input.to(dev)
                        labels = labels.to(dev)
                        # Reset gradients
                        optimizer.zero_grad()
                        # Compute output
                        pred = net(input)
                        #pred = pred.squeeze(dim=1) # Output shape is [Batch size, 1], but we want [Batch size]
                        labels = labels.unsqueeze(1)
                        labels = labels.float()
                        loss = criterion(pred, labels)
                        # Update loss
                        sum_loss[split] += loss.item()
                        # Check parameter update
                        if split == "train":
                            # Compute gradients
                            loss.backward()
                            # Optimize
                            optimizer.step()
                        # Compute accuracy
                        #pred_labels = pred.argmax(1) + 1
                        pred_labels = (pred >= 0.5).long()  # Binarize predictions to 0 and 1
                        batch_accuracy = (pred_labels == labels).sum().item() / input.size(0)
                        # Update accuracy
                        sum_accuracy[split] += batch_accuracy
            # Compute epoch loss/accuracy
            epoch_loss = {split: sum_loss[split] / len(loaders[split]) for split in ["train", "val", "test"]}
            epoch_accuracy = {split: sum_accuracy[split] / len(loaders[split]) for split in ["train", "val", "test"]}
            # Store params at the best validation accuracy
            if save_param and epoch_accuracy["val"] > best_val_accuracy:
                #torch.save(net.state_dict(), f"{net.__class__.__name__}_best_val.pth")
                torch.save(net.state_dict(), f"{model_name}_best_val.pth")
                best_val_accuracy = epoch_accuracy["val"]
            # Update history
            for split in ["train", "val", "test"]:
                history_loss[split].append(epoch_loss[split])
                history_accuracy[split].append(epoch_accuracy[split])
            # Print info
            print(f"Epoch {epoch+1}:",
                  f"TrL={epoch_loss['train']:.4f},",
                  f"TrA={epoch_accuracy['train']:.4f},",
                  f"VL={epoch_loss['val']:.4f},",
                  f"VA={epoch_accuracy['val']:.4f},",
                  f"TeL={epoch_loss['test']:.4f},",
                  f"TeA={epoch_accuracy['test']:.4f},")
    except KeyboardInterrupt:
        print("Interrupted")
    finally:
        # Plot loss
        plt.title("Loss")
        for split in ["train", "val", "test"]:
            plt.plot(history_loss[split], label=split)
        plt.legend()
        plt.show()
        # Plot accuracy
        plt.title("Accuracy")
        for split in ["train", "val", "test"]:
            plt.plot(history_accuracy[split], label=split)
        plt.legend()
        plt.show()
But I still get the same error.
Yes, if you have two inputs of data points, then pass two arguments here:
pred = net(input1, input2)  # input1 -> mnist, input2 -> cifar
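A minimal sketch of the resulting batch loop, assuming both loaders use the same batch size (the names and label handling are my guess, not the answerer's code):

for (m_input, m_labels), (c_input, c_labels) in zip(loaders[split], loaders_cifar[split]):
    m_input, c_input = m_input.to(dev), c_input.to(dev)
    optimizer.zero_grad()
    pred = net(m_input, c_input)   # forward(m, c) now receives both arguments
    # pred has shape [batch, 2] here, so the target passed to the criterion
    # has to be built with a matching shape

zip stops at the shorter of the two loaders, which also keeps the batches paired one-to-one.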

RuntimeError: Given groups=1, weight of size [64, 3, 3, 3], expected input[64, 1, 32, 32] to have 3 channels, but got 1 channels instead

I am working on the SVHN dataset, and I get this error only during the training phase; the instantiation of the model works.
RuntimeError: Given groups=1, weight of size [64, 3, 3, 3], expected input[64, 1, 32, 32] to have 3 channels, but got 1 channels instead
To be sure of having 3 channels, I wrote that Grayscale(3) transformation.
# Compose transformations
data_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.Grayscale(num_output_channels=3),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])
# Compose transformations
test_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
])
# Load SVHN dataset with transforms
train_set_svhn = torchvision.datasets.SVHN(root=base_dir, split='train', download=True, transform=data_transform, target_transform=None)
test_set_svhn = torchvision.datasets.SVHN(root=base_dir, split='test', download=True, transform=test_transform)
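A quick check (mine, not from the post) that the transform really produces 3 channels is to inspect one transformed sample:

img, label = train_set_svhn[0]
print(img.shape)   # expect torch.Size([3, 32, 32]) after Grayscale(num_output_channels=3)

If a 1-channel tensor still reaches the model, it is coming from a different loader or transform than the ones shown here.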
class VGG16(nn.Module):
    def __init__(self, num_classes):
        super(VGG16, self).__init__()
        # calculate same padding:
        # (w - k + 2*p)/s + 1 = o
        # => p = (s(o-1) - w + k)/2
        self.block_1 = nn.Sequential(
            # (1(32-1) - 32 + 3)/2 = 1
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.block_2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.block_3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.classifier = nn.Sequential(
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(p=0.25),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(p=0.25),
            nn.Linear(4096, num_classes),
        )
        for m in self.modules():
            if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Linear):
                nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
                # nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    m.bias.detach().zero_()
        # self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.block_1(x)
        x = self.block_2(x)
        x = self.block_3(x)
        # x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        logits = self.classifier(x)
        probas = F.softmax(logits, dim=1)
        # probas = nn.Softmax(logits)
        return probas
        # return logits
I have no idea where that 1 comes from.
Moreover, this is the shape of the output of the model without the classifier (the fully connected layers):
output = model1(test_x)
output.shape
torch.Size([1, 256, 4, 4])
And indeed I will pass 256x4x4 as input to the first FC.
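(Checking that flatten arithmetic against the classifier's first layer:)

# 32x32 input through three blocks, each ending in stride-2 max pooling: 32 -> 16 -> 8 -> 4
flat_features = 256 * 4 * 4   # = 4096, which matches nn.Linear(4096, 4096)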
Edit:
I encountered a similar problem, but I did not use the MNIST dataset. I confirmed that my input is of shape (256, 256, 3), and this is my error:
RuntimeError: Given groups=1, weight of size [64, 3, 4, 4], expected input[2, 2, 64, 64] to have 3 channels, but got 2 channels instead

Binary classification on MNIST: loss and accuracies remain constant

I am trying to do binary classification on the MNIST dataset: class 0 for even numbers and class 1 for odd numbers. I am using a simplified version of VGG.
My NN has a loss and an accuracy that remain constant.
I want to point out that my model reached over 90% accuracy before I changed the targets into binary targets, so probably something is wrong.
Here I change the targets into binary:
for i in range(10):
    idx = (train_set.targets == i)
    # note: the (i == 0) test is redundant, since 0 % 2 == 0 already
    if (i == 0) or ((i % 2) == 0): train_set.targets[idx] = 0
    else: train_set.targets[idx] = 1
for i in range(10):
    idx = (test_set.targets == i)
    if (i == 0) or ((i % 2) == 0): test_set.targets[idx] = 0
    else: test_set.targets[idx] = 1
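For what it's worth, the same mapping can be written in one line per split, since the class is just the remainder modulo 2 (an equivalent sketch, not the original code):

train_set.targets = train_set.targets % 2
test_set.targets = test_set.targets % 2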
This is my net:
class VGG16(nn.Module):
    def __init__(self, num_classes):
        super(VGG16, self).__init__()
        # calculate same padding:
        # (w - k + 2*p)/s + 1 = o
        # => p = (s(o-1) - w + k)/2
        self.block_1 = nn.Sequential(
            # (1(32-1) - 32 + 3)/2 = 1
            nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.block_2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.block_3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.block_4 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.classifier = nn.Sequential(
            nn.Linear(2048, 4096),
            nn.ReLU(True),
            nn.Dropout(p=0.65),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(p=0.65),
            nn.Linear(4096, num_classes),
            nn.Sigmoid()
        )
        for m in self.modules():
            if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Linear):
                nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
                # nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    m.bias.detach().zero_()
        # self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.block_1(x)
        x = self.block_2(x)
        x = self.block_3(x)
        x = self.block_4(x)
        # x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x
        #logits = self.classifier(x)
        #probas = F.softmax(logits, dim=1)
        # probas = nn.Softmax(logits)
        #return probas
        # return logits
# Define an optimizer
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr = 0.01)
# Define a loss
criterion = nn.BCELoss()
def train(net, loaders, optimizer, criterion, epochs=20, dev=dev, save_param=False, model_name="valerio"):
    try:
        net = net.to(dev)
        #print(net)
        # Initialize history
        history_loss = {"train": [], "val": [], "test": []}
        history_accuracy = {"train": [], "val": [], "test": []}
        # Store the best val accuracy
        best_val_accuracy = 0
        # Process each epoch
        for epoch in range(epochs):
            # Initialize epoch variables
            sum_loss = {"train": 0, "val": 0, "test": 0}
            sum_accuracy = {"train": 0, "val": 0, "test": 0}
            # Process each split
            for split in ["train", "val", "test"]:
                if split == "train":
                    net.train()
                else:
                    net.eval()
                # Process each batch
                for (input, labels) in loaders[split]:
                    # Move to CUDA
                    input = input.to(dev)
                    labels = labels.to(dev)
                    # Reset gradients
                    optimizer.zero_grad()
                    # Compute output
                    pred = net(input)
                    labels = labels.unsqueeze(1)
                    labels = labels.float()
                    loss = criterion(pred, labels)
                    # Update loss
                    sum_loss[split] += loss.item()
                    # Check parameter update
                    if split == "train":
                        # Compute gradients
                        loss.backward()
                        # Optimize
                        optimizer.step()
                    # Compute accuracy
                    # NOTE: pred has a single column, so pred.max(1) always returns
                    # index 0, and pred_labels ([batch]) vs labels ([batch, 1])
                    # broadcasts in the comparison below
                    _, pred_labels = pred.max(1)
                    batch_accuracy = (pred_labels == labels).sum().item() / input.size(0)
                    # Update accuracy
                    sum_accuracy[split] += batch_accuracy
            # Compute epoch loss/accuracy
            epoch_loss = {split: sum_loss[split] / len(loaders[split]) for split in ["train", "val", "test"]}
            epoch_accuracy = {split: sum_accuracy[split] / len(loaders[split]) for split in ["train", "val", "test"]}
            # Store params at the best validation accuracy
            if save_param and epoch_accuracy["val"] > best_val_accuracy:
                #torch.save(net.state_dict(), f"{net.__class__.__name__}_best_val.pth")
                torch.save(net.state_dict(), f"{model_name}_best_val.pth")
                best_val_accuracy = epoch_accuracy["val"]
            # Update history
            for split in ["train", "val", "test"]:
                history_loss[split].append(epoch_loss[split])
                history_accuracy[split].append(epoch_accuracy[split])
            # Print info
            print(f"Epoch {epoch+1}:",
                  f"TrL={epoch_loss['train']:.4f},",
                  f"TrA={epoch_accuracy['train']:.4f},",
                  f"VL={epoch_loss['val']:.4f},",
                  f"VA={epoch_accuracy['val']:.4f},",
                  f"TeL={epoch_loss['test']:.4f},",
                  f"TeA={epoch_accuracy['test']:.4f},")
    except KeyboardInterrupt:
        print("Interrupted")
    finally:
        # Plot loss
        plt.title("Loss")
        for split in ["train", "val", "test"]:
            plt.plot(history_loss[split], label=split)
        plt.legend()
        plt.show()
        # Plot accuracy
        plt.title("Accuracy")
        for split in ["train", "val", "test"]:
            plt.plot(history_accuracy[split], label=split)
        plt.legend()
        plt.show()
From the previous digit-recognition model I changed only the targets, the final layer of the classifier from 10 classes to 1 class + Sigmoid, and the loss from cross entropy to BCELoss. What am I doing wrong?
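Two lines look wrong to me (a sketch of the fix, not code from the post): with a single sigmoid output, pred.max(1) always returns index 0, and since pred.max(1) yields a [batch] tensor while labels was unsqueezed to [batch, 1], the == comparison broadcasts to [batch, batch]. That also explains "accuracies" above 1, such as 31.4211 below. Thresholding keeps the shapes aligned:

pred_labels = (pred >= 0.5).float()   # shape [batch, 1], same as labels
batch_accuracy = (pred_labels == labels).sum().item() / input.size(0)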
These are loss and accuracy values:
Epoch 1: TrL=49.0955, TrA=31.4211, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 2: TrL=49.0992, TrA=31.4235, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 3: TrL=49.0899, TrA=31.4176, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 4: TrL=49.0936, TrA=31.4199, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 5: TrL=49.0936, TrA=31.4199, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
Epoch 6: TrL=49.0825, TrA=31.4128, VL=49.7285, VA=31.7340, TeL=49.2635, TeA=31.3758,
What's wrong? How is it possible that with 10 classes I reached over 90% accuracy, while with a simplified version of only 2 classes I reach 30% accuracy?
Edit: after increasing the batch size from 64 to 128, accuracy reaches 60% and remains constant...
In my opinion, the problem is the varied representation of odd and even numbers. Take 1 and 3: pictures of these digits are sundry, and convolutional neural networks have a problem extracting common features for them. The neural network has 90% accuracy with 10 classes, so why do you need to convert this into 2? If you know that the number is 1, 3, 5, 7 or 9, you know that it's odd.
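A sketch of that suggestion, keeping the original 10-class model and deriving parity from the predicted digit (hypothetical names):

logits = model(input)           # 10-class output, shape [batch, 10]
digit = logits.argmax(dim=1)    # predicted digit, 0-9
parity = digit % 2              # 0 = even, 1 = odd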

How to construct a sobel filter for kernel initialization in input layer for images of size 128x128x3?

This is my code for the sobel filter:
def init_f(shape, dtype=None):
    sobel_x = tf.constant([[-5, -4, 0, 4, 5],
                           [-8, -10, 0, 10, 8],
                           [-10, -20, 0, 20, 10],
                           [-8, -10, 0, 10, 8],
                           [-5, -4, 0, 4, 5]])
    ker = np.zeros(shape, dtype)
    ker_shape = tf.shape(ker)
    kernel = tf.tile(sobel_x, ker_shape)  # Is this correct?
    return kernel

model.add(Conv2D(filters=30, kernel_size=(5,5), kernel_initializer=init_f, strides=(1,1), activation='relu'))
So far I have managed to do this, but it gives me this error:
Shape must be rank 2 but is rank 4 for 'conv2d_17/Tile' (op: 'Tile') with input shapes: [5,5], [4].
Tensorflow Version: 2.1.0
You're close, but the args to tile don't appear to be correct; that is why you're getting the error "Shape must be rank 2 but is rank 4 for...". Your sobel_x must be a rank-4 tensor, so you need to add two more dimensions. I used reshape in this example.
from tensorflow import keras
import tensorflow as tf
import numpy

def kernelInitializer(shape, dtype=None):
    print(shape)
    sobel_x = tf.constant(
        [
            [-5, -4, 0, 4, 5],
            [-8, -10, 0, 10, 8],
            [-10, -20, 0, 20, 10],
            [-8, -10, 0, 10, 8],
            [-5, -4, 0, 4, 5]
        ], dtype=dtype)
    # create the missing dims.
    sobel_x = tf.reshape(sobel_x, (5, 5, 1, 1))
    print(tf.shape(sobel_x))
    # tile the last 2 axes to get the expected dims.
    sobel_x = tf.tile(sobel_x, (1, 1, shape[-2], shape[-1]))
    print(tf.shape(sobel_x))
    return sobel_x

x1 = keras.layers.Input((128, 128, 3))
cvl = keras.layers.Conv2D(30, kernel_size=(5,5), kernel_initializer=kernelInitializer, strides=(2,2), activation='relu')
model = keras.Sequential()
model.add(x1)
model.add(cvl)

data = numpy.ones((1, 128, 128, 3))
data[:, 0:64, 0:64, :] = 0
pd = model.predict(data)
print(pd.shape)
d = pd[0, :, :, 0]
for row in d:
    for col in row:
        m = '0'
        if col != 0:
            m = 'X'
        print(m, end="")
    print("")
I looked at using expand_dims instead of reshape, but there didn't appear to be any advantage. broadcast_to seems ideal, but you still have to add the dimensions first, so I don't think it is better than tile.
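For reference, a broadcast_to version might look like this (the reshape is still needed first, since broadcasting also wants the size-1 dims in place):

sobel_x = tf.reshape(sobel_x, (5, 5, 1, 1))
sobel_x = tf.broadcast_to(sobel_x, shape)   # shape is the (5, 5, in, out) argument of the initializer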
Why 30 copies of the same filter, though? Are they going to be changed during training?

simple keras nn does not predict well

the code:
x1 = np.array([1, 10])
x2 = np.array([7, 4])
x3 = np.array([8, 7])
x4 = np.array([1, 15])
x5 = np.array([4, 4])
X = np.array([x1, x2, x3, x4, x5])
X = X / 100
Y = np.array([4, 8, 7, 5, 1])
Y = Y / 100
model = Sequential()
model.add(Dense(4, input_dim=2, activation='sigmoid', kernel_initializer="uniform"))
model.add(Dense(2, activation='sigmoid', kernel_initializer="uniform"))
model.add(Dense(1, activation='sigmoid', kernel_initializer="uniform"))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X, Y, epochs=500, batch_size=3)
toPred = np.array([x1]) / 100
print(model.predict(toPred) * 100)
For everything I predict I get a strange result: all the predictions are almost the same, and they are not close to the real value.
Suggestions?
Try this sample instead. I didn't change much, just a different approach to scaling and a longer training time.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
x1 = np.array([1, 10])
x2 = np.array([7, 4])
x3 = np.array([8, 7])
x4 = np.array([1, 15])
x5 = np.array([4, 4])
X = np.array([x1, x2, x3, x4, x5])
# Rescale the inputs to keep them small, since the first activation is a sigmoid
# (note: this subtracts the std and divides by the mean; it is not a true 0-1 scaling)
X = (X - X.std()) / X.mean()
# Don't need to scale Y, which leaves us with one less unnecessary operation
Y = np.array([4, 8, 7, 5, 1])
model = Sequential()
model.add(Dense(4, input_dim=2, activation='sigmoid', kernel_initializer="uniform"))
model.add(Dense(2, activation='sigmoid', kernel_initializer="uniform"))
#Set output activation to linear
model.add(Dense(1, activation='linear', kernel_initializer="uniform"))
model.compile(loss='mean_squared_error', optimizer='adam')
#Train for 5k epochs, since the loss keeps decreasing
model.fit(X, Y, epochs=5000, batch_size=5)
print(model.predict(X))
gives me
[[ 3.50988507]
[ 7.0278182 ]
[ 7.61787605]
[ 5.38016272]
[ 1.63140726]]
Sometimes you just need to tinker with the hyper-parameters. You could probably eliminate the second dense layer, since this data set is small, and I also get better results using the SGD (stochastic gradient descent) optimizer. You can also get good results faster by turning up the learning rate (this may only work well for this snippet). So just play around until you get the result you're looking for. Hope this helps :)
from keras.optimizers import SGD
opt = SGD(lr=.05)
model.compile(loss='mean_squared_error', optimizer=opt)
model.fit(X, Y, epochs=1000, batch_size=5)