Using SGD on MNIST dataset with PyTorch, loss not decreasing

I tried to train on the MNIST dataset using SGD with a batch size of 32, but the loss does not decrease at all.
I checked my model and loss function and read the documentation, but I couldn't figure out what I've done wrong.
I defined my neural network as below:
class classification(nn.Module):
    def __init__(self):
        super(classification, self).__init__()
        # construct layers for a neural network
        self.classifier1 = nn.Sequential(
            nn.Linear(in_features=28*28, out_features=20*20),
            nn.Sigmoid(),
        )
        self.classifier2 = nn.Sequential(
            nn.Linear(in_features=20*20, out_features=10*10),
            nn.Sigmoid(),
        )
        self.classifier3 = nn.Sequential(
            nn.Linear(in_features=10*10, out_features=10),
            nn.LogSoftmax(dim=1),
        )

    def forward(self, inputs):               # [batchSize, 1, 28, 28]
        x = inputs.view(inputs.size(0), -1)  # [batchSize, 28*28]
        x = self.classifier1(x)              # [batchSize, 20*20]
        x = self.classifier2(x)              # [batchSize, 10*10]
        out = self.classifier3(x)            # [batchSize, 10]
        return out
And I defined my training process as below:
classifier = classification().to("cuda")
# optimizer
optimizer = torch.optim.SGD(classifier.parameters(), lr=learning_rate_value)
# loss function
criterion = nn.NLLLoss()
batch_size = 32
epoch = 30
# array to save loss history
loss_train_arr = np.zeros(epoch)
# use DataLoader to split the training set into batches
batched_train = torch.utils.data.DataLoader(training_set, batch_size, shuffle=True)

for i in range(epoch):
    loss_train = 0
    # train and compute loss, accuracy
    for img, label in batched_train:
        img = img.to(device)
        label = label.to(device)
        optimizer.zero_grad()
        predicted = classifier(img)
        label_predicted = torch.argmax(predicted, dim=1)
        loss = criterion(predicted, label)
        loss.backward
        optimizer.step()
        loss_train += loss.item()
    loss_train_arr[i] = loss_train / (len(batched_train.dataset) / batch_size)
Since the model ends with a LogSoftmax layer, NLLLoss seems like the right loss function. But the loss does not decrease at all.
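For what it's worth, that pairing can be sanity-checked in isolation: NLLLoss applied to log-probabilities should match CrossEntropyLoss applied to the raw logits.

import torch
import torch.nn as nn

logits = torch.randn(4, 10)                  # a dummy batch of raw scores
targets = torch.randint(0, 10, (4,))

log_probs = nn.LogSoftmax(dim=1)(logits)
nll = nn.NLLLoss()(log_probs, targets)       # LogSoftmax + NLLLoss
ce = nn.CrossEntropyLoss()(logits, targets)  # CrossEntropyLoss on raw logits
print(torch.allclose(nll, ce))               # True: the two combinations are equivalent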

If the code you posted is the exact code you use, the problem is that you never actually call backward on the loss: loss.backward is missing the parentheses (), so it merely references the method without calling it. No gradients are ever computed, and optimizer.step() therefore has nothing meaningful to apply, which is why the loss never decreases.
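With the parentheses added, the inner loop reads:

loss = criterion(predicted, label)
loss.backward()    # actually computes gradients for all parameters
optimizer.step()   # the update now uses fresh gradients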

Related

Cross Entropy Loss function not converging

I'm new to neural networks and I'm building one that reads handwritten digits. The loss floats around 2.2-2.3 and I'm not sure why it isn't converging. I tried messing with the learning rate, but it doesn't really do anything.
# TODO: Define function to create our own neural network

# Parameters
input_size = 784   # Hint: image size is 28x28, and we want to flatten the image
num_classes = 10   # Hint: our inputs include 0-9
num_epochs = 5     # Number of times we loop through the entire training dataset, can be pretty arbitrary

class NN(nn.Module):
    ############ YOUR CODE STARTS HERE ############
    # 1. Initialize our own NN model
    def __init__(self, input_size, num_classes):
        super(NN, self).__init__()
        self.flatten = nn.Flatten()
        # Use ReLU activation function
        self.relu = nn.ReLU()
        # Input layer
        self.input_layer = nn.Linear(input_size, 13)
        # Hidden layers: use at least 1 hidden layer!
        self.hidden1 = nn.Linear(13, 6)
        # Output layer
        self.output_layer = nn.Linear(6, 10)

    # 2. Define method for forwarding input data
    def forward(self, sample):
        sample = self.flatten(sample)
        out = self.input_layer(sample)
        out = self.relu(out)
        out = self.hidden1(out)
        out = self.relu(out)              # TODO: activation function
        out = self.output_layer(out)      # TODO: forward to output layer
        return out

nn_model = NN(input_size, num_classes)
print("My NN Model: ", nn_model)
Loss function:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=.0001)
Training:
total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Reshape our images from 2D (28x28) to 1D (784)
        images = images.view(-1, 28*28).to(device)
        labels = labels.to(device)
        # Call functions we've previously defined to perform forward pass & calculate loss
        output = nn_model.forward(images)
        loss = loss_function(output, labels)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()     # calculates gradients
        optimizer.step()
        # Print out training process
        if (i+1) % 100 == 0:
            print(f'epoch {epoch+1} / {num_epochs}, step {i+1}/{total_steps}, loss = {loss.item():.4f}')
Any help is appreciated!

Training a NN on top of cached embeddings from a pre-trained model, loss not going down?

I have some embeddings, the output of a pre-trained model, saved to disk. I am trying to perform a binary accept/reject classification task. I have trained a simple neural network to perform the classification; however, I am not seeing any decrease in the loss after some time.
Here is my NN; the cached embeddings are of shape 512:
from transformers.modeling_outputs import SequenceClassifierOutput

class ClassNet(nn.Module):
    def __init__(self, num_labels=2):
        super(ClassNet, self).__init__()
        self.num_labels = num_labels
        self.classifier = nn.Sequential(
            nn.Linear(512, 256, bias=True),
            nn.ReLU(inplace=True),
            nn.Dropout(p=.5, inplace=False),
            nn.Linear(256, 128, bias=True),
            nn.ReLU(inplace=True),
            nn.Dropout(p=.5, inplace=False),
            nn.Linear(128, num_labels, bias=True)
        )

    def forward(self, inputs):
        return self.classifier(inputs)
This is a somewhat arbitrary architecture that I am deliberately trying to overfit, but the network seems to plateau quickly on the training data. Could it be that my data is too complicated?
Here's my training loop:
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=5e-3)  # weight_decay applies L2 regularization
loss_fct = nn.CrossEntropyLoss()
model.train()

for epoch in range(10):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        # get the inputs; each batch is a dict of {'embeddings': ..., 'labels': ...}
        inputs, labels = data['embeddings'], data['labels']
        # zero the parameter gradients
        optimizer.zero_grad()
        outputs = model(inputs)
        logits = outputs.squeeze(1)
        loss = loss_fct(logits, labels.squeeze())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')
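For context, train_loader yields dict-style batches. A minimal sketch of a Dataset that would produce them from embeddings cached on disk (the file name and cache layout here are hypothetical, not taken from the question):

import torch
from torch.utils.data import Dataset, DataLoader

class EmbeddingDataset(Dataset):
    '''Hypothetical wrapper: assumes a file holding {'embeddings': (N, 512), 'labels': (N,)}.'''
    def __init__(self, path):
        cached = torch.load(path)
        self.embeddings = cached['embeddings'].float()
        self.labels = cached['labels'].long()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # the default collate function batches these dicts into {'embeddings': ..., 'labels': ...}
        return {'embeddings': self.embeddings[idx], 'labels': self.labels[idx]}

train_loader = DataLoader(EmbeddingDataset('embeddings.pt'), batch_size=32, shuffle=True)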
The loss is stuck at around 0.4 and doesn't really decrease at all after an epoch.
To give a little context, the pre-trained embeddings are the output of a specially trained ViT model from HuggingFace; I am trying to perform a classification task directly on the outputs of that model by building a simple neural network on top of it.
Can anyone advise on what is going wrong? Also, if anyone has suggestions for getting better accuracy, I would love to hear them.

ValueError: Expected input batch_size (24) to match target batch_size (8)

I found many links about this error and read different Stack Overflow answers related to it, but I'm not able to figure it out.
My image batch has size torch.Size([8, 3, 16, 16]).
My architecture is as below:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # linear layers
        self.fc1 = nn.Linear(16 * 16, 768)
        self.fc2 = nn.Linear(768, 64)
        self.fc3 = nn.Linear(64, 10)
        self.dropout = nn.Dropout(p=.5)

    def forward(self, x):
        # flatten image input
        x = x.view(-1, 16 * 16)
        # add hidden layers, with relu activation function
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = F.log_softmax(self.fc3(x), dim=1)
        return x

# specify loss function
criterion = nn.NLLLoss()
# specify optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=.003)

# number of epochs to train the model
n_epochs = 30  # suggest training between 20-50 epochs

model.train()  # prep model for training
for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0

    ###################
    # train the model #
    ###################
    for data, target in trainloader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item() * data.size(0)

    # print training statistics
    # calculate average loss over an epoch
    train_loss = train_loss / len(trainloader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch + 1,
        train_loss
    ))
I am getting the value error:
ValueError: Expected input batch_size (24) to match target batch_size (8).
How do I fix it? My batch size is 8, the input image size is (16x16), and I have a 10-class classification problem.
Your input images have 3 channels, so your input feature size is 16*16*3, not 16*16. Currently you treat each channel as a separate instance: after the x.view(-1, 16*16) flattening, the batch fed to the classifier has shape (24, 16*16). Naturally the batch size no longer matches: it is supposed to be 8, not 8*3 = 24.
You could either:
Switch to a CNN to handle multi-channel inputs (here 3 channels).
Use a self.fc1 with 16*16*3 input features (see the sketch below).
If the input is RGB, maybe even convert it to a 1-channel grayscale map.
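A minimal sketch of the second option, assuming everything else in the training code stays the same:

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(16 * 16 * 3, 768)  # all 3 channels flattened together
        self.fc2 = nn.Linear(768, 64)
        self.fc3 = nn.Linear(64, 10)
        self.dropout = nn.Dropout(p=.5)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # (8, 3, 16, 16) -> (8, 768); batch size preserved
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        return F.log_softmax(self.fc3(x), dim=1)

Flattening with x.view(x.size(0), -1) rather than x.view(-1, 16*16) keeps the batch dimension fixed, so a shape mismatch fails loudly at the Linear layer instead of silently changing the batch size.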

PyTorch model for 2D regression given a scalar input

I want to create a model to perform this regression: given the scalar t, predict the two series values (x, y).
My dataset looks like:
t,x,y
0.0,-,0.5759052335487023
0.01,-,-
0.02,1.1159124144549086,-
0.03,-,-
0.04,1.0054825084650338,0.4775267298487888
0.05,-,-
I'm having some trouble with the loss, the dataset loading, the batch_size, and the Net structure (I use a single layer to simplify the problem).
That's my code:
Net:
class Net(nn.Module):
    '''Model to regress 2d time series values given scalar input.'''
    def __init__(self):
        super(Net, self).__init__()
        # Layers
        self.predict = nn.Linear(1, 2)

    def forward(self, x):
        x = self.predict(x)
        return x
Dataset load:
class TimeSeriesDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file):
        # Load the dataset: read the csv file as a dataframe
        df = pd.read_csv(csv_file, header=0, na_values='-')
        # Store the inputs and outputs
        self.x = df.values[:, :-2].astype('float32')
        self.y = df.values[:, 1:].astype('float32')
        # Ensure target has the right shape
        self.y = self.y.reshape((len(self.y), 2))

    def __len__(self):
        # Return the number of rows in the dataset
        return len(self.x)

    def __getitem__(self, idx):
        # Return a row at an index
        return [self.x[idx], self.y[idx]]
Trainloader, loss, optimizer:
dataset = TimeSeriesDataset('data.csv')
trainloader = torch.utils.data.DataLoader(
    dataset, batch_size=32, shuffle=True, num_workers=2)

def lossFunc(outputs, labels):
    # behaves like nn.MSELoss(): mean squared error, (x - y)^2, works fine
    # for regression problems with small values
    return torch.mean((outputs - labels)**2)

net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
print(net)
Training:
for epoch in range(300):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # TODO: get the data (inputs, labels)
        inputs, labels = data
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        #print("Inputs", inputs)
        #print("labels", labels)
        #print("outputs", outputs)
        loss = lossFunc(outputs, labels)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if i % 20 == 19:  # print every 20 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            running_loss = 0.0

print('Finished Training')
The outputs look like this:
tensor([[nan, nan],
        [nan, nan],
        [nan, nan],
        ...
And when I run the 300 epochs, the loss value doesn't change; it just prints nan.
After the line loss = loss(outputs, labels), loss is a tensor, not a function anymore; Python does not allow two distinct objects to share the same name in one scope.
So after the first call, loss has been rebound to a tensor, and since, as the error says, tensors are not callable, the second call fails.
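A minimal sketch of the fix, using nn.MSELoss as in the commented-out line: keep the loss function and the loss value under different names.

import torch.nn as nn

criterion = nn.MSELoss()                   # the loss *function* keeps one name
for epoch in range(300):
    for inputs, labels in trainloader:
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)  # the loss *value* gets another name
        loss.backward()
        optimizer.step()

Any distinct name works; criterion is just the conventional choice.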

Cannot make this autoencoder network function properly (with convolutional and maxpool layers)

Autoencoder networks seem to be way trickier than normal classifier MLP networks. After several attempts using Lasagne, all I get in the reconstructed output is something that at best resembles a blurry average of all the images in the MNIST database, with no distinction based on what the input digit actually is.
The network structure I chose is the following cascade of layers:
input layer (28x28)
2D convolutional layer, filter size 7x7
Max Pooling layer, size 3x3, stride 2x2
Dense (fully connected) flattening layer, 10 units (this is the bottleneck)
Dense (fully connected) layer, 121 units
Reshaping layer to 11x11
2D convolutional layer, filter size 3x3
2D Upscaling layer factor 2
2D convolutional layer, filter size 3x3
2D Upscaling layer factor 2
2D convolutional layer, filter size 5x5
Feature max pooling (from 31x28x28 to 28x28)
All the 2D convolutional layers have untied biases, sigmoid activations, and 31 filters.
All the fully connected layers have sigmoid activations.
The loss function is squared error and the update rule is Adagrad. Training draws chunks of 100 samples, repeated for 1000 epochs.
Just for completeness, the following is the code I used:
import theano.tensor as T
import theano
import sys
sys.path.insert(0, './Lasagne')  # local checkout of Lasagne
import lasagne
from theano import pp
from theano import function
import gzip
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

def load_mnist():
    def load_mnist_images(filename):
        with gzip.open(filename, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        # The inputs are vectors now, we reshape them to monochrome 2D images,
        # following the shape convention: (examples, channels, rows, columns)
        data = data.reshape(-1, 1, 28, 28)
        # The inputs come as bytes, we convert them to float32 in range [0,1].
        # (Actually to range [0, 255/256], for compatibility to the version
        # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
        return data / np.float32(256)

    def load_mnist_labels(filename):
        # Read the labels in Yann LeCun's binary format.
        with gzip.open(filename, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=8)
        # The labels are vectors of integers now, that's exactly what we want.
        return data

    X_train = load_mnist_images('train-images-idx3-ubyte.gz')
    y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
    X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
    y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
    return X_train, y_train, X_test, y_test

def plot_filters(conv_layer):
    W = conv_layer.get_params()[0]
    W_fn = theano.function([], W)
    params = W_fn()
    ks = np.squeeze(params)
    kstack = np.vstack(ks)
    plt.imshow(kstack, interpolation='none')
    plt.show()

def main():
    #theano.config.exception_verbosity="high"
    #theano.config.optimizer='None'
    X_train, y_train, X_test, y_test = load_mnist()
    ohe = OneHotEncoder()
    y_train = ohe.fit_transform(np.expand_dims(y_train, 1)).toarray()
    chunk_len = 100
    visamount = 10
    num_epochs = 1000
    num_filters = 31
    dropout_p = .0
    print "X_train.shape", X_train.shape, "y_train.shape", y_train.shape
    input_var = T.tensor4('X')
    output_var = T.tensor4('X')
    conv_nonlinearity = lasagne.nonlinearities.sigmoid

    net = lasagne.layers.InputLayer((chunk_len, 1, 28, 28), input_var)
    conv1 = net = lasagne.layers.Conv2DLayer(net, num_filters, (7, 7), nonlinearity=conv_nonlinearity, untie_biases=True)
    net = lasagne.layers.MaxPool2DLayer(net, (3, 3), stride=(2, 2))
    net = lasagne.layers.DropoutLayer(net, p=dropout_p)
    #conv2_layer = lasagne.layers.Conv2DLayer(dropout_layer, num_filters, (3, 3), nonlinearity=conv_nonlinearity)
    #pool2_layer = lasagne.layers.MaxPool2DLayer(conv2_layer, (3, 3), stride=(2, 2))
    net = lasagne.layers.DenseLayer(net, 10, nonlinearity=lasagne.nonlinearities.sigmoid)
    #augment_layer1 = lasagne.layers.DenseLayer(reduction_layer, 33, nonlinearity=lasagne.nonlinearities.sigmoid)
    net = lasagne.layers.DenseLayer(net, 121, nonlinearity=lasagne.nonlinearities.sigmoid)
    net = lasagne.layers.ReshapeLayer(net, (chunk_len, 1, 11, 11))
    net = lasagne.layers.Conv2DLayer(net, num_filters, (3, 3), nonlinearity=conv_nonlinearity, untie_biases=True)
    net = lasagne.layers.Upscale2DLayer(net, 2)
    net = lasagne.layers.Conv2DLayer(net, num_filters, (3, 3), nonlinearity=conv_nonlinearity, untie_biases=True)
    #pool_after0 = lasagne.layers.MaxPool2DLayer(conv_after1, (3, 3), stride=(2, 2))
    net = lasagne.layers.Upscale2DLayer(net, 2)
    net = lasagne.layers.DropoutLayer(net, p=dropout_p)
    #conv_after2 = lasagne.layers.Conv2DLayer(upscale_layer1, num_filters, (3, 3), nonlinearity=conv_nonlinearity, untie_biases=True)
    #pool_after1 = lasagne.layers.MaxPool2DLayer(conv_after2, (3, 3), stride=(1, 1))
    #upscale_layer2 = lasagne.layers.Upscale2DLayer(pool_after1, 4)
    net = lasagne.layers.Conv2DLayer(net, num_filters, (5, 5), nonlinearity=conv_nonlinearity, untie_biases=True)
    net = lasagne.layers.FeaturePoolLayer(net, num_filters, pool_function=theano.tensor.max)
    print "output_shape:", lasagne.layers.get_output_shape(net)

    params = lasagne.layers.get_all_params(net, trainable=True)
    prediction = lasagne.layers.get_output(net)
    loss = lasagne.objectives.squared_error(prediction, output_var)
    #loss = lasagne.objectives.binary_crossentropy(prediction, output_var)
    aggregated_loss = lasagne.objectives.aggregate(loss)
    updates = lasagne.updates.adagrad(aggregated_loss, params)
    train_fn = theano.function([input_var, output_var], loss, updates=updates)
    test_prediction = lasagne.layers.get_output(net, deterministic=True)
    predict_fn = theano.function([input_var], test_prediction)

    print "starting training..."
    for epoch in range(num_epochs):
        selected = list(set(np.random.random_integers(0, 59999, chunk_len*4)))[:chunk_len]
        X_train_sub = X_train[selected, :]
        _loss = train_fn(X_train_sub, X_train_sub)
        print("Epoch %d: Loss %g" % (epoch + 1, np.sum(_loss) / len(X_train)))
        """
        chunk = X_train[0:chunk_len, :, :, :]
        result = predict_fn(chunk)
        vis1 = np.hstack([chunk[j, 0, :, :] for j in range(visamount)])
        vis2 = np.hstack([result[j, 0, :, :] for j in range(visamount)])
        plt.imshow(np.vstack([vis1, vis2]))
        plt.show()
        """
    print "done."

    chunk = X_train[0:chunk_len, :, :, :]
    result = predict_fn(chunk)
    print "chunk.shape", chunk.shape
    print "result.shape", result.shape
    plot_filters(conv1)
    for i in range(chunk_len / visamount):
        vis1 = np.hstack([chunk[i*visamount+j, 0, :, :] for j in range(visamount)])
        vis2 = np.hstack([result[i*visamount+j, 0, :, :] for j in range(visamount)])
        plt.imshow(np.vstack([vis1, vis2]))
        plt.show()
    import ipdb; ipdb.set_trace()

if __name__ == "__main__":
    main()
Any ideas on how to improve this network to get a reasonably functioning autoencoder?