Why does my CIFAR 100 CNN model mainly predict two classes? - neural-network

I am currently trying to get a decent score (> 40% accuracy) with Keras on CIFAR 100. However, I'm experiencing a weird behaviour of a CNN model: It tends to predict some classes (2 - 5) much more often than others:
The pixel at position (i, j) contains the count how many elements of the validation set from class i were predicted to be of class j. Thus the diagonal contains the correct classifications, everything else is an error. The two vertical bars indicate that the model often predicts those classes, although it is not the case.
CIFAR 100 is perfectly balanced: All 100 classes have 500 training samples.
Why does the model tend to predict some classes MUCH more often than other classes? How can this be fixed?
The code
Running this takes a while.
#!/usr/bin/env python
from __future__ import print_function
from keras.datasets import cifar100
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import numpy as np
batch_size = 32
nb_classes = 100
nb_epoch = 50
data_augmentation = True
# input image dimensions
img_rows, img_cols = 32, 32
# The CIFAR10 images are RGB.
img_channels = 3
# The data, shuffled and split between train and test sets:
(X, y), (X_test, y_test) = cifar100.load_data()
X_train, X_val, y_train, y_val = train_test_split(X, y,
test_size=0.20,
random_state=42)
# Shuffle training data
perm = np.arange(len(X_train))
np.random.shuffle(perm)
X_train = X_train[perm]
y_train = y_train[perm]
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_val.shape[0], 'validation samples')
print(X_test.shape[0], 'test samples')
# Convert class vectors to binary class matrices.
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
Y_val = np_utils.to_categorical(y_val, nb_classes)
model = Sequential()
model.add(Convolution2D(32, 3, 3, border_mode='same',
input_shape=X_train.shape[1:]))
model.add(Activation('relu'))
model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Convolution2D(64, 3, 3, border_mode='same'))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(1024))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
X_train = X_train.astype('float32')
X_val = X_val.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_val /= 255
X_test /= 255
if not data_augmentation:
print('Not using data augmentation.')
model.fit(X_train, Y_train,
batch_size=batch_size,
nb_epoch=nb_epoch,
validation_data=(X_val, y_val),
shuffle=True)
else:
print('Using real-time data augmentation.')
# This will do preprocessing and realtime data augmentation:
datagen = ImageDataGenerator(
featurewise_center=False, # set input mean to 0 over the dataset
samplewise_center=False, # set each sample mean to 0
featurewise_std_normalization=False, # divide inputs by std of the dataset
samplewise_std_normalization=False, # divide each input by its std
zca_whitening=False, # apply ZCA whitening
rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180)
width_shift_range=0.1, # randomly shift images horizontally (fraction of total width)
height_shift_range=0.1, # randomly shift images vertically (fraction of total height)
horizontal_flip=True, # randomly flip images
vertical_flip=False) # randomly flip images
# Compute quantities required for featurewise normalization
# (std, mean, and principal components if ZCA whitening is applied).
datagen.fit(X_train)
# Fit the model on the batches generated by datagen.flow().
model.fit_generator(datagen.flow(X_train, Y_train,
batch_size=batch_size),
samples_per_epoch=X_train.shape[0],
nb_epoch=nb_epoch,
validation_data=(X_val, Y_val))
model.save('cifar100.h5')
Visualization code
#!/usr/bin/env python
"""Analyze a cifar100 keras model."""
from keras.models import load_model
from keras.datasets import cifar100
from sklearn.model_selection import train_test_split
import numpy as np
import json
import io
import matplotlib.pyplot as plt
try:
to_unicode = unicode
except NameError:
to_unicode = str
n_classes = 100
def plot_cm(cm, zero_diagonal=False):
"""Plot a confusion matrix."""
n = len(cm)
size = int(n / 4.)
fig = plt.figure(figsize=(size, size), dpi=80, )
plt.clf()
ax = fig.add_subplot(111)
ax.set_aspect(1)
res = ax.imshow(np.array(cm), cmap=plt.cm.viridis,
interpolation='nearest')
width, height = cm.shape
fig.colorbar(res)
plt.savefig('confusion_matrix.png', format='png')
# Load model
model = load_model('cifar100.h5')
# Load validation data
(X, y), (X_test, y_test) = cifar100.load_data()
X_train, X_val, y_train, y_val = train_test_split(X, y,
test_size=0.20,
random_state=42)
# Calculate confusion matrix
y_val_i = y_val.flatten()
y_val_pred = model.predict(X_val)
y_val_pred_i = y_val_pred.argmax(1)
cm = np.zeros((n_classes, n_classes), dtype=np.int)
for i, j in zip(y_val_i, y_val_pred_i):
cm[i][j] += 1
acc = sum([cm[i][i] for i in range(100)]) / float(cm.sum())
print("Validation accuracy: %0.4f" % acc)
# Create plot
plot_cm(cm)
# Serialize confusion matrix
with io.open('cm.json', 'w', encoding='utf8') as outfile:
str_ = json.dumps(cm.tolist(),
indent=4, sort_keys=True,
separators=(',', ':'), ensure_ascii=False)
outfile.write(to_unicode(str_))
Red herrings
tanh
I've replaced tanh by relu. The history csv looks ok, but the visualization has the same problem:
Please also note that the validation accuracy here is only 3.44%.
Dropout + tanh + border mode
Removing dropout, replacing tanh by relu, setting border mode to same everywhere: history csv
The visualization code still gives a much lower accuracy (8.50% this time) than the keras training code.
Q & A
The following is a summary of the comments:
The data is evenly distributed over the classes. So there is no "over training" of those two classes.
Data augmentation is used, but without data augmentation the problem persists.
The visualization is not the problem.

If you get good accuracy during training and validation, but not when testing, make sure you do exactly the same preprocessing on your dataset in both cases.
Here you have when training:
X_train /= 255
X_val /= 255
X_test /= 255
But no such code when predicting for your confusion matrix. Adding to testing:
X_val /= 255.
Gives the following nice looking confusion matrix:

I don't have a good feeling with this part of the code:
model.add(Dense(1024))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
The remaining model is full of relus, but here there is a tanh.
tanh sometimes vanishes or explodes (saturates at -1 and 1), which might lead to your 2-class overimportance.
keras-example cifar 10 basically uses the same architecture (dense-layer sizes might be different), but also uses a relu there (no tanh at all). The same goes for this external keras-based cifar 100 code.

One important part of the problem was that my ~/.keras/keras.json was
{
"image_dim_ordering": "th",
"epsilon": 1e-07,
"floatx": "float32",
"backend": "tensorflow"
}
Hence I had to change image_dim_ordering to tf. This leads to
and an accuracy of 12.73%. Obviously, there is still a problem as the validation history gave 45.1% accuracy.

I don't see you doing mean-centering, even in datagen. I suspect this is the main cause. To do mean centering using ImageDataGenerator, set featurewise_center = 1. Another way is to subtract the ImageNet mean from each RGB pixel. The mean vector to be subtracted is [103.939, 116.779, 123.68].
Make all activations relus, unless you have a specific reason to have a single tanh.
Remove two dropouts of 0.25 and see what happens. If you want to apply dropouts to convolution layer, it is better to use SpatialDropout2D. It is somehow removed from Keras online documentation but you can find it in the source.
You have two conv layers with same and two with valid. There is nothing wrong in this, but it would be simpler to keep all conv layers with same and control your size just based on max-poolings.

Related

Unscale predictions back to original form

This is a regression problem.
The shape of my training is: (417, 5) and the test data shape is: (105, 5). I do scaling for both using the following code:
from sklearn import preprocessing
import sklearn
from sklearn.preprocessing import MinMaxScaler
#Scale train
scaler = preprocessing.MinMaxScaler()
train_df = scaler.fit_transform(train_df)
train_df = pd.DataFrame(train_df)
#Scale test
test_df = scaler.fit_transform(test_df)
test_df = pd.DataFrame(test_df)
First four rows of training data after scaling look like below:
while '4' is the dependent variable and the rest are independent variables.
After training using deep neural network, I get predictions in scaled form. I try to unscale predictions using the following code:
scaler.inverse_transform(y_pred_dnn)
while predictions are stored in y_pred_dnn
But I get the following error:
ValueError: non-broadcastable output operand with shape (105,1) doesn't match the broadcast shape (105,5)
How do I debug the problem?
Thanks
you can solve this by separating out y before scaling. You dont need to scale y for prediction. So try:
y_train, y_test = train_df.iloc[:, 4], test_df.iloc[:, 4]
X_train, X_test = train_df.iloc[:, 1:4], test_df.iloc[:, 1:4]
After this you do te scaling on X part only and you wont need any inverse scaling

non-linear neural network regression - quadratic function is not being estimated correctly

I have mostly used ANNs for classification and only recently started to try them out for modeling continuous variables. As an exercise I generated a simple set of (x, y) pairs where y = x^2 and tried to train an ANN to learn this quadratic function.
The ANN model:
This ANN has 1 input node (ie. x), 2 hidden layers each with 2 nodes in each layer, and 1 output node. All four hidden nodes use the non-linear tanh activation function and the output node has no activation function (since it is regression).
The Data:
For the training set I randomly generated 100 numbers between (-20, 20) for x and computed y=x^2. For the testing set I randomly generated 100 numbers between (-30, 30) for x and also computed y=x^2. I then transformed all x so that they are centered around 0 and their min and max are approximately around -1.5 and 1.5. I also transformed all y similarly but made their min and max about -0.9 and 0.9. This way, all the data falls within that mid range of the tanh activation function and not way out at the extremes.
The Problem:
After training the ANN in Keras, I am seeing that only the right half of the polynomial function is being learned, and the left half is completely flat. Does anyone have any ideas why this may be happening? I tried playing around with different scaling options, as well as hidden layer specifications but no luck on that left side.
Thanks!
Attached is the code I used for everything and the image shows the plot of the scaled training x vs the predicted y. As you can see, only half of the parabola is recovered.
import numpy as np, pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
seed = 10
n = 100
X_train = np.random.uniform(-20, 20, n)
Y_train = X_train ** 2
X_test = np.random.uniform(-30, 30, n)
Y_test = X_test ** 2
#### Scale the data
x_cap = max(abs(np.array(list(X_train) + list(X_test))))
y_cap = max(abs(np.array(list(Y_train) + list(Y_test))))
x_mean = np.mean(np.array(list(X_train) + list(X_test)))
y_mean = np.mean(np.array(list(Y_train) + list(Y_test)))
X_train2 = (X_train-x_mean) / x_cap
X_test2 = (X_test-x_mean) / x_cap
Y_train2 = (Y_train-y_mean) / y_cap
Y_test2 = (Y_test-y_mean) / y_cap
X_train2 = X_train2 * (1.5 / max(X_train2))
Y_train2 = Y_train2 * (0.9 / max(Y_train2))
# define base model
def baseline_model1():
# create model
model1 = Sequential()
model1.add(Dense(2, input_dim=1, kernel_initializer='normal', activation='tanh'))
model1.add(Dense(2, input_dim=1, kernel_initializer='normal', activation='tanh'))
model1.add(Dense(1, kernel_initializer='normal'))
# Compile model
model1.compile(loss='mean_squared_error', optimizer='adam')
return model1
np.random.seed(seed)
estimator1 = KerasRegressor(build_fn=baseline_model1, epochs=100, batch_size=5, verbose=0)
estimator1.fit(X_train2, Y_train2)
prediction = estimator1.predict(X_train2)
plt.scatter(X_train2, prediction)
enter image description here
You should also consider adding more width to you hidden layer. I changed from 2 to 5 and got a very good fit. I also used more epochs as suggested from rvinas
Your network is very sensible to the initial parameters. The following will help:
Change your kernel_initializer to glorot_uniform. Your network is very small and glorot_uniform will work better in consonance with the tanh activations. Glorot uniform will encourage your weights to be initially within a more reasonable range (since it takes into account the fan-in and fan-out of each layer).
Train your model for more epochs (i.e. 1000).

Binary classification always outputs 1

I'm making my first steps on keras and I'm trying to do binary classification on the cancer dataset available in scikit-learn
# load dataset
from sklearn import datasets
cancer = datasets.load_breast_cancer()
cancer.data
# dataset into pd.dataframe
import pandas as pd
donnee = pd.concat([pd.DataFrame(data = cancer.data, columns = cancer.feature_names),
pd.DataFrame(data = cancer.target, columns = ["target"])
], axis = 1)
# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(donnee.loc[:, donnee.columns != "target"], donnee.target, test_size = 0.25, random_state = 1)
I'm trying to follow keras' tutorial here : https://keras.io/#getting-started-30-seconds-to-keras
The thing is, I always get the same loss value (6.1316862406430541), and the same accuracy (0.61538461830232527), because the predictions are always 1.
I'm not sure if it's because of a code error :
I don't know, maybe the shape of X_train is wrong ?
Or maybe I'm doing something wrong with epochs and/or batch_size.
Or if it's because of the network itself :
if I'm not mistaken, all 1 predictions is possible if there's no biases to the layers, and I don't know yet how they're initialized
But maybe it's something else, maybe 1 layer only is too few ? (if so, I wonder why keras' tutorial is 1 layer only...)
Here is my code, if you have any idea :
import keras
from keras.models import Sequential
model = Sequential()
from keras.layers import Dense
model.add(Dense(units=64, activation='relu', input_dim=30))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()
model.compile(loss = keras.losses.binary_crossentropy,
optimizer = 'rmsprop',
metrics=['accuracy']
)
model.fit(X_train.as_matrix(), y_train.as_matrix().reshape(426, -1), epochs=5, batch_size=32)
loss_and_metrics = model.evaluate(X_test.as_matrix(), y_test.as_matrix(), batch_size=128)
loss_and_metrics
classes = model.predict(X_test.as_matrix(), batch_size=128)
classes
This is a very usual case. If you check the histogram of your data you will see that there are data points in your dataset which coordinates spans from 0 to 100. When you feed such data to neural network input to sigmoid might be so big that it will suffer from underflow. In order to scale data, you could use either MinMaxScaler or StandardScaler thanks to what you'll make your data to have a span suitable for neural network computations.

MNIST ValueError when checking target in Keras

My code is as follows:
from keras.datasets import mnist
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation
(x_train, y_train), (x_test, y_test) = mnist.load_data()
y_train = np_utils.to_categorical(y_train, 10)
y_test = np_utils.to_categorical(y_test, 10)
model = Sequential()
model.add(Dense(output_dim=500, input_shape=(28, 28)))
model.add(Activation("tanh"))
model.add(Dense(10))
model.add(Activation("softmax"))
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(x_train, y_train, nb_epoch=50, batch_size=20)
This gets the following Error:
ValueError: Error when checking target: expected activation_2 to have 3 dimensions, but got array with shape (60000, 10)
I think the shape (60000, 10) is the shape of y_train and this has 2 dimensions while expected 3 dimensions somewhere.
Where should I edit?
MNIST samples are images of 28 x 28 values (pixels). You want to classify with an ANN that only takes in a 1-dimensional array of numbers (imagine the first layer of your ANN as a 500 long row of neurons that only understand a 500 long row of numbers but not a 28x28 matrix).
To fix your error you have to reshape your data beforehand:
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
Y_train = np_utils.to_categorical(y_train, 10)
Y_test = np_utils.to_categorical(y_test, 10)
...
model = Sequential()
model.add(Dense(512, input_shape=(784,)))
If you want to feed in your data in 2d to achieve more accuracy by preserving the spatial order of your images rather than flatten them out to a long list of numbers than you have to change your models architecture to a Convolutional Neural Network (lots of example code online, especially for MNIST).

How to build simple neural network on keras (not image recognition)

I am new to keras and I am trying to built my own neural network.
A task:
I need to write a system that can make decisions for the character, which may meet one or more enemies. The system can be known:
Percentage Health character
Presence of the pistol;
The number of enemies.
The answer must be in the form of one of the following:
Attack
Run
Hide (for a surprise attack)
To do nothing
To train up I made a table of "lessons":
https://i.stack.imgur.com/lD0WX.png
So here is my code:
# Create first network with Keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
import numpy
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# split into input (X) and output (Y) variables
X = numpy.array([[0.5,1,1], [0.9,1,2], [0.8,0,1], [0.3,1,1], [0.6,1,2], [0.4,0,1], [0.9,1,7], [0.5,1,4], [0.1,0,1], [0.6,1,0], [1,0,0]])
Y = numpy.array([[1],[1],[1],[2],[2],[2],[3],[3],[3],[4],[4]])
# create model
model = Sequential()
model.add(Dense(3, input_dim=3, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='sigmoid'))
# Compile model
sgd = SGD(lr=0.001)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
# Fit the model
model.fit(X, Y, nb_epoch=150)
# calculate predictions
predictions = model.predict(X)
# round predictions
rounded = [round(x) for x in predictions]
print(rounded)
Here the predictions I get.
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
The accuracy on each epoch is 0.2727 and the loss is decrease.
It's not right.
I was trying to devide learning rate by 10, changing activations and optimizers. Even data I input manually.
Can anyone tell me how to solve my simple problem. thx.
There are several problems in your code.
Number of data entries are very small compared to the NN model.
Y is represented as classes number and not as class vector. A regression model can be learnt on this but its a poor design choice.
output of softmax function is always between 0-1 .. as this is used your model only knows to spew out values between 0-1.
Here below is a bit better modified code:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
import numpy
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# split into input (X) and output (Y) variables
X = numpy.array([[0.5,1,1], [0.9,1,2], [0.8,0,1], [0.3,1,1], [0.6,1,2], [0.4,0,1], [0.9,1,7], [0.5,1,4], [0.1,0,1], [0.6,1,0], [1,0,0]])
y = numpy.array([[1],[1],[1],[2],[2],[2],[3],[3],[3],[0],[0]])
from keras.utils import np_utils
Y = np_utils.to_categorical(y, 4)
# print Y
# create model
model = Sequential()
model.add(Dense(3, input_dim=3, activation='relu'))
model.add(Dense(4, activation='softmax'))
# Compile model
# sgd = SGD(lr=0.1)
# model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# Fit the model
model.fit(X, Y, nb_epoch=700)
# calculate predictions
predictions = model.predict(X)
predictions_class = predictions.argmax(axis=-1)
print(predictions_class)
Note I have used the softmax activation as the classes are mutually exclusive