Cannot make this autoencoder network function properly (with convolutional and maxpool layers) - neural-network

Autoencoder networks seems to be way trickier than normal classifier MLP networks. After several attempts using Lasagne all what I get in the reconstructed output is something that resembles at its best a blurry averaging of all the images of the MNIST database without distinction on what the input digit actually is.
The networks structure I chose are the following cascade layers:
input layer (28x28)
2D convolutional layer, filter size 7x7
Max Pooling layer, size 3x3, stride 2x2
Dense (fully connected) flattening layer, 10 units (this is the bottleneck)
Dense (fully connected) layer, 121 units
Reshaping layer to 11x11
2D convolutional layer, filter size 3x3
2D Upscaling layer factor 2
2D convolutional layer, filter size 3x3
2D Upscaling layer factor 2
2D convolutional layer, filter size 5x5
Feature max pooling (from 31x28x28 to 28x28)
All the 2D convolutional layers have the biases untied, sigmoid activations and 31 filters.
All the fully connected layers have sigmoid activations.
The loss function used is squared error, the updating function is adagrad. The length of the chunk for the learning is 100 samples, multiplied for 1000 epochs.
Just for completeness, the following is the code I used:
import theano.tensor as T
import theano
import sys
sys.path.insert(0,'./Lasagne') # local checkout of Lasagne
import lasagne
from theano import pp
from theano import function
import gzip
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
def load_mnist():
def load_mnist_images(filename):
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=16)
# The inputs are vectors now, we reshape them to monochrome 2D images,
# following the shape convention: (examples, channels, rows, columns)
data = data.reshape(-1, 1, 28, 28)
# The inputs come as bytes, we convert them to float32 in range [0,1].
# (Actually to range [0, 255/256], for compatibility to the version
# provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
return data / np.float32(256)
def load_mnist_labels(filename):
# Read the labels in Yann LeCun's binary format.
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=8)
# The labels are vectors of integers now, that's exactly what we want.
return data
X_train = load_mnist_images('train-images-idx3-ubyte.gz')
y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
return X_train, y_train, X_test, y_test
def plot_filters(conv_layer):
W = conv_layer.get_params()[0]
W_fn = theano.function([],W)
params = W_fn()
ks = np.squeeze(params)
kstack = np.vstack(ks)
plt.imshow(kstack,interpolation='none')
plt.show()
def main():
#theano.config.exception_verbosity="high"
#theano.config.optimizer='None'
X_train, y_train, X_test, y_test = load_mnist()
ohe = OneHotEncoder()
y_train = ohe.fit_transform(np.expand_dims(y_train,1)).toarray()
chunk_len = 100
visamount = 10
num_epochs = 1000
num_filters=31
dropout_p=.0
print "X_train.shape",X_train.shape,"y_train.shape",y_train.shape
input_var = T.tensor4('X')
output_var = T.tensor4('X')
conv_nonlinearity = lasagne.nonlinearities.sigmoid
net = lasagne.layers.InputLayer((chunk_len,1,28,28), input_var)
conv1 = net = lasagne.layers.Conv2DLayer(net,num_filters,(7,7),nonlinearity=conv_nonlinearity,untie_biases=True)
net = lasagne.layers.MaxPool2DLayer(net,(3,3),stride=(2,2))
net = lasagne.layers.DropoutLayer(net,p=dropout_p)
#conv2_layer = lasagne.layers.Conv2DLayer(dropout_layer,num_filters,(3,3),nonlinearity=conv_nonlinearity)
#pool2_layer = lasagne.layers.MaxPool2DLayer(conv2_layer,(3,3),stride=(2,2))
net = lasagne.layers.DenseLayer(net,10,nonlinearity=lasagne.nonlinearities.sigmoid)
#augment_layer1 = lasagne.layers.DenseLayer(reduction_layer,33,nonlinearity=lasagne.nonlinearities.sigmoid)
net = lasagne.layers.DenseLayer(net,121,nonlinearity=lasagne.nonlinearities.sigmoid)
net = lasagne.layers.ReshapeLayer(net,(chunk_len,1,11,11))
net = lasagne.layers.Conv2DLayer(net,num_filters,(3,3),nonlinearity=conv_nonlinearity,untie_biases=True)
net = lasagne.layers.Upscale2DLayer(net,2)
net = lasagne.layers.Conv2DLayer(net,num_filters,(3,3),nonlinearity=conv_nonlinearity,untie_biases=True)
#pool_after0 = lasagne.layers.MaxPool2DLayer(conv_after1,(3,3),stride=(2,2))
net = lasagne.layers.Upscale2DLayer(net,2)
net = lasagne.layers.DropoutLayer(net,p=dropout_p)
#conv_after2 = lasagne.layers.Conv2DLayer(upscale_layer1,num_filters,(3,3),nonlinearity=conv_nonlinearity,untie_biases=True)
#pool_after1 = lasagne.layers.MaxPool2DLayer(conv_after2,(3,3),stride=(1,1))
#upscale_layer2 = lasagne.layers.Upscale2DLayer(pool_after1,4)
net = lasagne.layers.Conv2DLayer(net,num_filters,(5,5),nonlinearity=conv_nonlinearity,untie_biases=True)
net = lasagne.layers.FeaturePoolLayer(net,num_filters,pool_function=theano.tensor.max)
print "output_shape:",lasagne.layers.get_output_shape(net)
params = lasagne.layers.get_all_params(net, trainable=True)
prediction = lasagne.layers.get_output(net)
loss = lasagne.objectives.squared_error(prediction, output_var)
#loss = lasagne.objectives.binary_crossentropy(prediction, output_var)
aggregated_loss = lasagne.objectives.aggregate(loss)
updates = lasagne.updates.adagrad(aggregated_loss,params)
train_fn = theano.function([input_var, output_var], loss, updates=updates)
test_prediction = lasagne.layers.get_output(net, deterministic=True)
predict_fn = theano.function([input_var], test_prediction)
print "starting training..."
for epoch in range(num_epochs):
selected = list(set(np.random.random_integers(0,59999,chunk_len*4)))[:chunk_len]
X_train_sub = X_train[selected,:]
_loss = train_fn(X_train_sub, X_train_sub)
print("Epoch %d: Loss %g" % (epoch + 1, np.sum(_loss) / len(X_train)))
"""
chunk = X_train[0:chunk_len,:,:,:]
result = predict_fn(chunk)
vis1 = np.hstack([chunk[j,0,:,:] for j in range(visamount)])
vis2 = np.hstack([result[j,0,:,:] for j in range(visamount)])
plt.imshow(np.vstack([vis1,vis2]))
plt.show()
"""
print "done."
chunk = X_train[0:chunk_len,:,:,:]
result = predict_fn(chunk)
print "chunk.shape",chunk.shape
print "result.shape",result.shape
plot_filters(conv1)
for i in range(chunk_len/visamount):
vis1 = np.hstack([chunk[i*visamount+j,0,:,:] for j in range(visamount)])
vis2 = np.hstack([result[i*visamount+j,0,:,:] for j in range(visamount)])
plt.imshow(np.vstack([vis1,vis2]))
plt.show()
import ipdb; ipdb.set_trace()
if __name__ == "__main__":
main()
Any ideas on how to improve this network to get a reasonably functioning autoencoder?

Related

Seemingly inconsistent tensor sizes in pytorch

I'm building a convolutional autoencoder, but want the encoding to be in a linear form so I can more easily feed it as input into an MLP. I have two convolutional layers on the encoder along with a linear inner layer to reduce dimension. This encoding is then fed into the corresponding decoder.
When I flatten the output of the second convolutional layer, based on my calculation (using the standard formula: Calculate the Output size in Convolution layer) should come out to a 1x100352 rank 1 tensor. However, when I set the input dimension of the linear layer to be 100352, the flattened rank 1 tensor has dimension 1x50176. Then comes the weird part.
I tried changing the input dimension of the linear layer to be 50176, assuming I had miscalculated. When I do this, the reshaped rank 1 tensor confusingly becomes 1x100352, and then the aforementioned weight matrix becomes 50176x256 as expected.
This response to modifying the linear layer's input dimension doesn't make sense to me. That hyperparameter controls the weight matrix correctly, but I guess I'm uncertain why it has any bearing on the linear layer's input since that's just a reshaped tensor output from a convolutional layer whose hyperparameters are unrelated to the hyperparameter in question.
I apologize if I'm just missing something obvious. I'm very new to pytorch, and I couldn't find any other posts which discussed this sort of issue.
Here's what I believe to be the minimal reproducible example:
import os
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets
from torch.utils.data import DataLoader
from torchvision.utils import save_image
class convAutoEncoder(nn.Module):
def __init__(self,**kwargs):
super().__init__()
#Creating network structure
#Encoder portion of autoencoder
self.enc1 = nn.Conv2d(in_channels = kwargs["inputChannels"], out_channels = kwargs["channelsEncoderMid"], kernel_size = kwargs["kernelSize"])
self.enc2 = nn.Conv2d(in_channels = kwargs["channelsEncoderMid"], out_channels = kwargs["channelsEncoderInner"], kernel_size = kwargs["kernelSize"])
self.enc3 = nn.Linear(in_features = kwargs["intoLinear"], out_features = kwargs["linearEncoded"])
#Decoder portion of autoencoder
self.dec1 = nn.Linear(in_features = kwargs["linearEncoded"], out_features = kwargs["intoLinear"])
self.dec2 = nn.ConvTranspose2d(in_channels = kwargs["channelsEncoderInner"], out_channels = kwargs["channelsDecoderMid"], kernel_size = kwargs["kernelSize"])
self.dec3 = nn.ConvTranspose2d(in_channels = kwargs["channelsDecoderMid"], out_channels = kwargs["inputChannels"], kernel_size = kwargs["kernelSize"])
def forward(self,x):
#Encoding
x = F.relu(self.enc1(x))
x = F.relu(self.enc2(x))
x = x.reshape(1,-1)
x = x.squeeze()
x = F.relu(self.enc3(x))
#Decoding
x = F.relu(self.dec1(x))
x = x.reshape([32,4,28,28])
x = F.relu(self.dec2(x))
x = F.relu(self.dec3(x))
return x
def encodeDecodeConv(numEpochs = 20, input_Channels = 3, batchSize = 32,
channels_Encoder_Inner = 4, channels_Encoder_Mid = 8, into_Linear = 100352,
linear_Encoded = 256, channels_Decoder_Mid = 8, kernel_Size = 3,
learningRate = 1e-3):
#Pick a device. If GPU available, use that. Otherwise, use CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#Define data transforms
transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
#Define training dataset
trainSet = datasets.CIFAR10(root = './data', train = True, download = True, transform = transform)
#Define testing dataset
testSet = datasets.CIFAR10(root = './data', train = False, download = True, transform = transform)
#Define data loaders
trainLoader = DataLoader(trainSet, batch_size = batchSize, shuffle = True)
testLoader = DataLoader(testSet, batch_size = batchSize, shuffle = True)
#Initialize neural network
model = convAutoEncoder(inputChannels = input_Channels, channelsEncoderMid = channels_Encoder_Mid, channelsEncoderInner = channels_Encoder_Inner, intoLinear = into_Linear, linearEncoded = linear_Encoded, channelsDecoderMid = channels_Decoder_Mid, kernelSize = kernel_Size)
#Optimization setup
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(),lr = learningRate)
lossTracker = []
for epoch in range(numEpochs):
loss = 0
for data,_ in trainLoader:
data = data.to(device)
optimizer.zero_grad()
outputs = model(data)
train_loss = criterion(outputs,data)
train_loss.backward()
optimizer.step()
loss += train_loss.item()
loss = loss/len(trainLoader)
print('Epoch {} of {}, Train loss: {:.3f}'.format(epoch+1,numEpochs,loss))
encodeDecodeConv()
Edit2: Somewhere in the CIFAR10 dataset, the data appears to change dimension. After playing around with print statements more, I discovered that setting the relevant hyperparameter to 100352 works great for many entries, but then seemingly one image pops up that has a different size. Not sure why that would occur, though.

Tensorflow - keras: bad performance for simple curve fitting task

I'm trying to implement a very simple one layered MLP for a toy regression problem with one variable (dimension = 1) and one target (dimension = 1). It's a simple curve fitting problem with zero noise.
Matlab\Deep Learning Toolbox
Using levenberg-marquardt backpropagation on a MLP with a single hidden layer with 100 neurons and hyperbolic tangent activation I got pretty decent performance with almost zero effort:
MSE = 7.18e-08
Plotting the predictions and the targets I get a very precise fitting.
Python\Tensorflow\Keras
With the same network settings I used in matlab there's almost no training. No matter how hard I try to tune the training parameters or switch the optimizer.
MSE = 0.12900154
In this case the plot of the predictions is a curve that is not even able to follow the oscillations of the target curve.
I can obtain something better using RELU activations for the hidden layer but we're still far:
MSE = 0.0582045
This is the code I used in Python:
# IMPORT LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
# IMPORT DATASET FROM CSV FILE, SHUFFLE TRAINING SET
# AND MAKE NUMPY ARRAY FOR TRAINING (DATA ARE ALREADY NORMALIZED)
dataset_path = "C:/Users/Rob/Desktop/Learning1.csv"
Learning_Dataset = pd.read_csv(dataset_path
, comment='\t',sep=","
,skipinitialspace=False)
Learning_Dataset = Learning_Dataset.sample(frac = 1) # SHUFFLING
test_dataset_path = "C:/Users/Rob/Desktop/Test1.csv"
Test_Dataset = pd.read_csv(test_dataset_path
, comment='\t',sep=","
,skipinitialspace=False)
Learning_Target = Learning_Dataset.pop('Target')
Test_Target = Test_Dataset.pop('Target')
Learning_Dataset = np.array(Learning_Dataset,dtype = "float32")
Test_Dataset = np.array(Test_Dataset,dtype = "float32")
Learning_Target = np.array(Learning_Target,dtype = "float32")
Test_Target = np.array(Test_Target,dtype = "float32")
# DEFINE SIMPLE MLP MODEL
inputs = tf.keras.layers.Input(shape=(1,))
x = tf.keras.layers.Dense(100, activation='relu')(inputs)
y = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs=inputs, outputs=y)
# TRAIN MODEL
opt = tf.keras.optimizers.RMSprop(learning_rate = 0.001,
rho = 0.9,
momentum = 0.0,
epsilon = 1e-07,
centered = False)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=100)
model.compile(optimizer = opt,
loss = 'mse',
metrics = ['mse'])
model.fit(Learning_Dataset,
Learning_Target,
epochs=500,
validation_split = 0.2,
verbose=0,
callbacks=[early_stop],
shuffle = False,
batch_size = 100)
# INFERENCE AND CHECK ACCURACY
Predictions = model.predict(Test_Dataset)
Predictions = Predictions.reshape(10000)
print(np.square(np.subtract(Test_Target,Predictions)).mean()) # MSE
plt.plot(Test_Dataset,Test_Target,'o',Test_Dataset,Predictions,'o')
plt.legend(('Target','Model Prediction'))
plt.show()
What am i doing wrong?
Thanks

Subprocessing Data Loading in pytroch into Google Colab

I'm working on training a deep neural network using pytorch and I use DataLoader for preprocessing data and multi-processing purpose over dataset. I set num_workers attribute to positive number like 4 and my batch_size is 8. I train network on Google Colab Environment but when training keep on after few minutes, stop training and get error in reading .PNG files. I think it's memory error and I want to know what is relation between number of GPU and batch_size and num_workers to set up a reasonable relation between them specially in Google Colab .
I think you can follow this page:
https://colab.research.google.com/notebook#fileId=1jxUPzMsAkBboHMQtGyfv5M5c7hU8Ss2c&scrollTo=EM7EnBoyK8nR
It provide a guide of how to set settings of Google Colab.
I try it and feels really fast.
Hope you love it.
Following is the code it provides but I change a bit about install pytorch:
#!/usr/bin/env python
# encoding: utf-8
import sys
sys.version
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
input_size = 784 # The image size = 28 x 28 = 784
hidden_size = 500 # The number of nodes at the hidden layer
num_classes = 10 # The number of output classes. In this case, from 0 to 9
num_epochs = 5 # The number of times entire dataset is trained
batch_size = 100 # The size of input data took for one iteration
learning_rate = 1e-3 # The speed of convergence
train_dataset = dsets.MNIST(root='./data',
train=True,
transform=transforms.ToTensor(),
download=True)
test_dataset = dsets.MNIST(root='./data',
train=False,
transform=transforms.ToTensor())
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
batch_size=batch_size,
shuffle=False)
class Net(nn.Module):
def __init__(self, input_size, hidden_size, num_classes):
super(Net, self).__init__() # Inherited from the parent class nn.Module
self.fc1 = nn.Linear(input_size, hidden_size) # 1st Full-Connected Layer: 784 (input data) -> 500 (hidden node)
self.relu = nn.ReLU() # Non-Linear ReLU Layer: max(0,x)
self.fc2 = nn.Linear(hidden_size, num_classes) # 2nd Full-Connected Layer: 500 (hidden node) -> 10 (output class)
def forward(self, x): # Forward pass: stacking each layer together
out = self.fc1(x)
out = self.relu(out)
out = self.fc2(out)
return out
net = Net(input_size, hidden_size, num_classes)
use_cuda = True
if use_cuda and torch.cuda.is_available():
net.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for i, (images, labels) in enumerate(train_loader): # Load a batch of images with its (index, data, class)
images = Variable(images.view(-1, 28*28)) # Convert torch tensor to Variable: change image from a vector of size 784 to a matrix of 28 x 28
labels = Variable(labels)
if use_cuda and torch.cuda.is_available():
images = images.cuda()
labels = labels.cuda()
optimizer.zero_grad() # Intialize the hidden weight to all zeros
outputs = net(images) # Forward pass: compute the output class given a image
loss = criterion(outputs, labels) # Compute the loss: difference between the output class and the pre-given label
loss.backward() # Backward pass: compute the weight
optimizer.step() # Optimizer: update the weights of hidden nodes
if (i+1) % 100 == 0: # Logging
print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
%(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.data[0]))
correct = 0
total = 0
for images, labels in test_loader:
images = Variable(images.view(-1, 28*28))
if use_cuda and torch.cuda.is_available():
images = images.cuda()
labels = labels.cuda()
outputs = net(images)
_, predicted = torch.max(outputs.data, 1) # Choose the best class from the output: The class with the best score
total += labels.size(0) # Increment the total count
correct += (predicted == labels).sum() # Increment the correct count
print('Accuracy of the network on the 10K test images: %d %%' % (100 * correct / total))
torch.save(net.state_dict(), 'fnn_model.pkl')

Why does my CIFAR 100 CNN model mainly predict two classes?

I am currently trying to get a decent score (> 40% accuracy) with Keras on CIFAR 100. However, I'm experiencing a weird behaviour of a CNN model: It tends to predict some classes (2 - 5) much more often than others:
The pixel at position (i, j) contains the count how many elements of the validation set from class i were predicted to be of class j. Thus the diagonal contains the correct classifications, everything else is an error. The two vertical bars indicate that the model often predicts those classes, although it is not the case.
CIFAR 100 is perfectly balanced: All 100 classes have 500 training samples.
Why does the model tend to predict some classes MUCH more often than other classes? How can this be fixed?
The code
Running this takes a while.
#!/usr/bin/env python
from __future__ import print_function
from keras.datasets import cifar100
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import numpy as np
batch_size = 32
nb_classes = 100
nb_epoch = 50
data_augmentation = True
# input image dimensions
img_rows, img_cols = 32, 32
# The CIFAR10 images are RGB.
img_channels = 3
# The data, shuffled and split between train and test sets:
(X, y), (X_test, y_test) = cifar100.load_data()
X_train, X_val, y_train, y_val = train_test_split(X, y,
test_size=0.20,
random_state=42)
# Shuffle training data
perm = np.arange(len(X_train))
np.random.shuffle(perm)
X_train = X_train[perm]
y_train = y_train[perm]
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_val.shape[0], 'validation samples')
print(X_test.shape[0], 'test samples')
# Convert class vectors to binary class matrices.
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
Y_val = np_utils.to_categorical(y_val, nb_classes)
model = Sequential()
model.add(Convolution2D(32, 3, 3, border_mode='same',
input_shape=X_train.shape[1:]))
model.add(Activation('relu'))
model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Convolution2D(64, 3, 3, border_mode='same'))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(1024))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
X_train = X_train.astype('float32')
X_val = X_val.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_val /= 255
X_test /= 255
if not data_augmentation:
print('Not using data augmentation.')
model.fit(X_train, Y_train,
batch_size=batch_size,
nb_epoch=nb_epoch,
validation_data=(X_val, y_val),
shuffle=True)
else:
print('Using real-time data augmentation.')
# This will do preprocessing and realtime data augmentation:
datagen = ImageDataGenerator(
featurewise_center=False, # set input mean to 0 over the dataset
samplewise_center=False, # set each sample mean to 0
featurewise_std_normalization=False, # divide inputs by std of the dataset
samplewise_std_normalization=False, # divide each input by its std
zca_whitening=False, # apply ZCA whitening
rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180)
width_shift_range=0.1, # randomly shift images horizontally (fraction of total width)
height_shift_range=0.1, # randomly shift images vertically (fraction of total height)
horizontal_flip=True, # randomly flip images
vertical_flip=False) # randomly flip images
# Compute quantities required for featurewise normalization
# (std, mean, and principal components if ZCA whitening is applied).
datagen.fit(X_train)
# Fit the model on the batches generated by datagen.flow().
model.fit_generator(datagen.flow(X_train, Y_train,
batch_size=batch_size),
samples_per_epoch=X_train.shape[0],
nb_epoch=nb_epoch,
validation_data=(X_val, Y_val))
model.save('cifar100.h5')
Visualization code
#!/usr/bin/env python
"""Analyze a cifar100 keras model."""
from keras.models import load_model
from keras.datasets import cifar100
from sklearn.model_selection import train_test_split
import numpy as np
import json
import io
import matplotlib.pyplot as plt
try:
to_unicode = unicode
except NameError:
to_unicode = str
n_classes = 100
def plot_cm(cm, zero_diagonal=False):
"""Plot a confusion matrix."""
n = len(cm)
size = int(n / 4.)
fig = plt.figure(figsize=(size, size), dpi=80, )
plt.clf()
ax = fig.add_subplot(111)
ax.set_aspect(1)
res = ax.imshow(np.array(cm), cmap=plt.cm.viridis,
interpolation='nearest')
width, height = cm.shape
fig.colorbar(res)
plt.savefig('confusion_matrix.png', format='png')
# Load model
model = load_model('cifar100.h5')
# Load validation data
(X, y), (X_test, y_test) = cifar100.load_data()
X_train, X_val, y_train, y_val = train_test_split(X, y,
test_size=0.20,
random_state=42)
# Calculate confusion matrix
y_val_i = y_val.flatten()
y_val_pred = model.predict(X_val)
y_val_pred_i = y_val_pred.argmax(1)
cm = np.zeros((n_classes, n_classes), dtype=np.int)
for i, j in zip(y_val_i, y_val_pred_i):
cm[i][j] += 1
acc = sum([cm[i][i] for i in range(100)]) / float(cm.sum())
print("Validation accuracy: %0.4f" % acc)
# Create plot
plot_cm(cm)
# Serialize confusion matrix
with io.open('cm.json', 'w', encoding='utf8') as outfile:
str_ = json.dumps(cm.tolist(),
indent=4, sort_keys=True,
separators=(',', ':'), ensure_ascii=False)
outfile.write(to_unicode(str_))
Red herrings
tanh
I've replaced tanh by relu. The history csv looks ok, but the visualization has the same problem:
Please also note that the validation accuracy here is only 3.44%.
Dropout + tanh + border mode
Removing dropout, replacing tanh by relu, setting border mode to same everywhere: history csv
The visualization code still gives a much lower accuracy (8.50% this time) than the keras training code.
Q & A
The following is a summary of the comments:
The data is evenly distributed over the classes. So there is no "over training" of those two classes.
Data augmentation is used, but without data augmentation the problem persists.
The visualization is not the problem.
If you get good accuracy during training and validation, but not when testing, make sure you do exactly the same preprocessing on your dataset in both cases.
Here you have when training:
X_train /= 255
X_val /= 255
X_test /= 255
But no such code when predicting for your confusion matrix. Adding to testing:
X_val /= 255.
Gives the following nice looking confusion matrix:
I don't have a good feeling with this part of the code:
model.add(Dense(1024))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
The remaining model is full of relus, but here there is a tanh.
tanh sometimes vanishes or explodes (saturates at -1 and 1), which might lead to your 2-class overimportance.
keras-example cifar 10 basically uses the same architecture (dense-layer sizes might be different), but also uses a relu there (no tanh at all). The same goes for this external keras-based cifar 100 code.
One important part of the problem was that my ~/.keras/keras.json was
{
"image_dim_ordering": "th",
"epsilon": 1e-07,
"floatx": "float32",
"backend": "tensorflow"
}
Hence I had to change image_dim_ordering to tf. This leads to
and an accuracy of 12.73%. Obviously, there is still a problem as the validation history gave 45.1% accuracy.
I don't see you doing mean-centering, even in datagen. I suspect this is the main cause. To do mean centering using ImageDataGenerator, set featurewise_center = 1. Another way is to subtract the ImageNet mean from each RGB pixel. The mean vector to be subtracted is [103.939, 116.779, 123.68].
Make all activations relus, unless you have a specific reason to have a single tanh.
Remove two dropouts of 0.25 and see what happens. If you want to apply dropouts to convolution layer, it is better to use SpatialDropout2D. It is somehow removed from Keras online documentation but you can find it in the source.
You have two conv layers with same and two with valid. There is nothing wrong in this, but it would be simpler to keep all conv layers with same and control your size just based on max-poolings.

How to use keras ImageDataGenerator with a Siamese or Tripple networks

I'm trying to build up both a Siamese neural network and triple neural network on a custom large dataset
Keras has ImageDataGenerator which makes the generation of input data to a regular neural network very easy.
I'm interesting to use ImageDataGenerator or similar ways in order to train a networks with 2(siamese) and 3(triple) inputs.
In mniset keras siamese example, The input generated by a pre-process stage which is done by create_pairs method. I don't think this kind of way fit for a large dataset.
Is it possible to use ImageDataGenerator in this case? What are my other options assuming the data-set is very big?
The idea of DataGenerators is to give fit_generator a stream of data in batches.. hence giving control to you how you want to produce the data, ie whether you load from files or you do some data augmentation like what is done in ImageDataGenerator.
Here I posting the modified version of mniset siamese example with custom DataGenerator, you can work it out from here.
import numpy as np
np.random.seed(1337) # for reproducibility
import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda
from keras.optimizers import SGD, RMSprop
from keras import backend as K
class DataGenerator(object):
"""docstring for DataGenerator"""
def __init__(self, batch_sz):
# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
# create training+test positive and negative pairs
digit_indices = [np.where(y_train == i)[0] for i in range(10)]
self.tr_pairs, self.tr_y = self.create_pairs(X_train, digit_indices)
digit_indices = [np.where(y_test == i)[0] for i in range(10)]
self.te_pairs, self.te_y = self.create_pairs(X_test, digit_indices)
self.tr_pairs_0 = self.tr_pairs[:, 0]
self.tr_pairs_1 = self.tr_pairs[:, 1]
self.te_pairs_0 = self.te_pairs[:, 0]
self.te_pairs_1 = self.te_pairs[:, 1]
self.batch_sz = batch_sz
self.samples_per_train = (self.tr_pairs.shape[0]/self.batch_sz)*self.batch_sz
self.samples_per_val = (self.te_pairs.shape[0]/self.batch_sz)*self.batch_sz
self.cur_train_index=0
self.cur_val_index=0
def create_pairs(self, x, digit_indices):
'''Positive and negative pair creation.
Alternates between positive and negative pairs.
'''
pairs = []
labels = []
n = min([len(digit_indices[d]) for d in range(10)]) - 1
for d in range(10):
for i in range(n):
z1, z2 = digit_indices[d][i], digit_indices[d][i+1]
pairs += [[x[z1], x[z2]]]
inc = random.randrange(1, 10)
dn = (d + inc) % 10
z1, z2 = digit_indices[d][i], digit_indices[dn][i]
pairs += [[x[z1], x[z2]]]
labels += [1, 0]
return np.array(pairs), np.array(labels)
def next_train(self):
while 1:
self.cur_train_index += self.batch_sz
if self.cur_train_index >= self.samples_per_train:
self.cur_train_index=0
yield ([ self.tr_pairs_0[self.cur_train_index:self.cur_train_index+self.batch_sz],
self.tr_pairs_1[self.cur_train_index:self.cur_train_index+self.batch_sz]
],
self.tr_y[self.cur_train_index:self.cur_train_index+self.batch_sz]
)
def next_val(self):
while 1:
self.cur_val_index += self.batch_sz
if self.cur_val_index >= self.samples_per_val:
self.cur_val_index=0
yield ([ self.te_pairs_0[self.cur_val_index:self.cur_val_index+self.batch_sz],
self.te_pairs_1[self.cur_val_index:self.cur_val_index+self.batch_sz]
],
self.te_y[self.cur_val_index:self.cur_val_index+self.batch_sz]
)
def euclidean_distance(vects):
x, y = vects
return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))
def eucl_dist_output_shape(shapes):
shape1, shape2 = shapes
return (shape1[0], 1)
def contrastive_loss(y_true, y_pred):
'''Contrastive loss from Hadsell-et-al.'06
http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
'''
margin = 1
return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
def create_base_network(input_dim):
'''Base network to be shared (eq. to feature extraction).
'''
seq = Sequential()
seq.add(Dense(128, input_shape=(input_dim,), activation='relu'))
seq.add(Dropout(0.1))
seq.add(Dense(128, activation='relu'))
seq.add(Dropout(0.1))
seq.add(Dense(128, activation='relu'))
return seq
def compute_accuracy(predictions, labels):
'''Compute classification accuracy with a fixed threshold on distances.
'''
return labels[predictions.ravel() < 0.5].mean()
input_dim = 784
nb_epoch = 20
batch_size=128
datagen = DataGenerator(batch_size)
# network definition
base_network = create_base_network(input_dim)
input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))
# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)
distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
model = Model(input=[input_a, input_b], output=distance)
# train
rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms)
model.fit_generator(generator=datagen.next_train(), samples_per_epoch=datagen.samples_per_train, nb_epoch=nb_epoch, validation_data=datagen.next_val(), nb_val_samples=datagen.samples_per_val)