predicting time series: my python code prints out a (very long) list rather than a (small) array

I am learning neural network modeling and its uses in time series prediction.
First, thank you for reading this post and for your help :)
On this page there are various NN models (LSTM, CNN, etc.) for predicting "traffic volume":
https://michael-fuchs-python.netlify.app/2020/11/01/time-series-analysis-neural-networks-for-forecasting-univariate-variables/#train-validation-split
I got inspired and decided to use/shorten/adapt the code in there for a problem of my own: predicting the bitcoin price.
I have the Bitcoin daily prices starting 1 January 2017, 2024 daily prices in total.
I use the first 85% of the data for training and the rest for validation (except the last 10 observations, which I would like to use as test data to see how good my model is).
I would like to use a Feedforward model
My goal is merely to have code that runs.
So far I have managed to get most of my code to run. However, I get a strange format for my test forecast results: it should simply be an array of 10 numbers (i.e., the predicted prices for the 10 days at the end of my data). To my surprise, what is printed out is a long list of numbers instead. I need help finding out what changes I need to make to the code.
Thank you for helping me :)
The code is pasted below, followed by the error:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing #import MinMaxScaler
from sklearn import metrics #import mean_squared_error
import seaborn as sns
sns.set()
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, Dense, Flatten
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import EarlyStopping
tf.__version__
df = pd.read_csv('/content/BTC-USD.csv')
def mean_absolute_percentage_error_func(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
def timeseries_evaluation_metrics_func(y_true, y_pred):
    print('Evaluation metric results: ')
    print(f'MSE is : {metrics.mean_squared_error(y_true, y_pred)}')
    print(f'MAE is : {metrics.mean_absolute_error(y_true, y_pred)}')
    print(f'RMSE is : {np.sqrt(metrics.mean_squared_error(y_true, y_pred))}')
    print(f'MAPE is : {mean_absolute_percentage_error_func(y_true, y_pred)}')
    print(f'R2 is : {metrics.r2_score(y_true, y_pred)}', end='\n\n')
def univariate_data_prep_func(dataset, start, end, window, horizon):
    X = []
    y = []
    start = start + window
    if end is None:
        end = len(dataset) - horizon
    for i in range(start, end):
        indicesx = range(i - window, i)
        X.append(np.reshape(dataset[indicesx], (window, 1)))
        indicesy = range(i, i + horizon)
        y.append(dataset[indicesy])
    return np.array(X), np.array(y)
# Generating the test set
test_data = df['close'].tail(10)
df = df.drop(df['close'].tail(10).index)
df.shape
# Defining the target variable
uni_data = df['close']
uni_data.index = df['formatted_date']
uni_data.head()
#scaling
from sklearn import preprocessing
uni_data = uni_data.values
scaler_x = preprocessing.MinMaxScaler()
x_scaled = scaler_x.fit_transform(uni_data.reshape(-1, 1))
# Single Step Style (sss) modeling
univar_hist_window_sss = 50
horizon_sss = 1
# 2014 observations in total
# 2014*0.85=1710 should be part of the training (304 validation)
train_split_sss = 1710
x_train_uni_sss, y_train_uni_sss = univariate_data_prep_func(x_scaled, 0, train_split_sss,
univar_hist_window_sss, horizon_sss)
x_val_uni_sss, y_val_uni_sss = univariate_data_prep_func(x_scaled, train_split_sss, None,
univar_hist_window_sss, horizon_sss)
print ('Length of first Single Window:')
print (len(x_train_uni_sss[0]))
print()
print ('Target horizon:')
print (y_train_uni_sss[0])
BATCH_SIZE_sss = 32
BUFFER_SIZE_sss = 150
train_univariate_sss = tf.data.Dataset.from_tensor_slices((x_train_uni_sss, y_train_uni_sss))
train_univariate_sss = train_univariate_sss.cache().shuffle(BUFFER_SIZE_sss).batch(BATCH_SIZE_sss).repeat()
validation_univariate_sss = tf.data.Dataset.from_tensor_slices((x_val_uni_sss, y_val_uni_sss))
validation_univariate_sss = validation_univariate_sss.batch(BATCH_SIZE_sss).repeat()
n_steps_per_epoch = 55
n_validation_steps = 10
n_epochs = 100
#FFNN architecture
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(8, input_shape=x_train_uni_sss.shape[-2:]),
    tf.keras.layers.Dense(units=horizon_sss)])
model.compile(loss='mse',
optimizer='adam')
#fit the model
model_path = '/content/FFNN_model_sss.h5'
keras_callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    min_delta=0, patience=10,
                                                    verbose=1, mode='min'),
                   tf.keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss',
                                                      save_best_only=True,
                                                      mode='min', verbose=0)]
history = model.fit(train_univariate_sss, epochs=n_epochs, steps_per_epoch=n_steps_per_epoch,
                    validation_data=validation_univariate_sss, validation_steps=n_validation_steps,
                    verbose=1, callbacks=keras_callbacks)
#validation
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
# Testing our model
trained_ffnn_model_sss = tf.keras.models.load_model(model_path)
df_temp = df['close']
test_horizon = df_temp.tail(univar_hist_window_sss)
test_history = test_horizon.values
result = []
# Define Forecast length here
window_len = len(test_data)
test_scaled = scaler_x.fit_transform(test_history.reshape(-1, 1))
for i in range(1, window_len + 1):
    test_scaled = test_scaled.reshape((1, test_scaled.shape[0], 1))
    # Inserting the model
    predicted_results = trained_ffnn_model_sss.predict(test_scaled)
    print(f'predicted : {predicted_results}')
    result.append(predicted_results[0])
    test_scaled = np.append(test_scaled[:, 1:], [[predicted_results]])
result_inv_trans = scaler_x.inverse_transform(result)
result_inv_trans
I believe the problem has to do with the shapes of the data, but I do not yet know exactly how.
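For what it's worth, a quick way to see where the long list comes from is to inspect the model's output shape; the following is just a diagnostic sketch against the model above, and the commented Flatten variant is an assumption about a remedy, not a confirmed fix:
trained_ffnn_model_sss.summary()
print(trained_ffnn_model_sss.output_shape)
# A Dense layer applied to input_shape=(50, 1) keeps the time axis, so the
# output shape is likely (None, 50, 1): 50 numbers per window instead of 1.
# If so, flattening the window before the Dense layers would give a single
# prediction per window (hypothetical variant of the architecture above):
# model = tf.keras.models.Sequential([
#     tf.keras.layers.Flatten(input_shape=x_train_uni_sss.shape[-2:]),
#     tf.keras.layers.Dense(8, activation='relu'),
#     tf.keras.layers.Dense(units=horizon_sss)])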
Data:
click here
Traceback:
click here

Related

How can I get every layers output value with keras?

I want to use a Keras LSTM to extract time-series features and then feed those features to k-means. But I cannot get the layer output values. How can I get them?
This is my LSTM network:
Layer (type)                 Output Shape        Param #
lstm_66 (LSTM)               (None, None, 50)    10400
lstm_67 (LSTM)               (None, 100)          60400
dense_19 (Dense)             (None, 1)            101
activation_19 (Activation)   (None, 1)            0
I want to get the lstm_67 output values, and my code is:
import keras.backend as K
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
sess = tf.Session()
sess.run(tf.global_variables_initializer())
import numpy as np
statesAll=[]
layers = model.layers
print(layers[1].output, type(layers[1].output[1]), sess.run(layers[1].output))
and the result is:
Tensor("lstm_61/TensorArrayReadV3:0", shape=(?, 100), dtype=float32)
So, how can I get the layer output values?
Thanks!
But it does not work; my code is:
def load_data(file_name, sequence_length=10, split=0.8):
    df = pd.read_csv(file_name, sep=',', usecols=[1])
    data_all = np.array(df).astype(float)
    scaler = MinMaxScaler()
    data_all = scaler.fit_transform(data_all)
    data = []
    print(len(data_all))
    for i in range(len(data_all) - sequence_length - 1):
        data.append(data_all[i: i + sequence_length + 1])
    reshaped_data = np.array(data).astype('float64')
    np.random.shuffle(reshaped_data)
    x = reshaped_data[:, :-1]
    y = reshaped_data[:, -1]
    split_boundary = int(reshaped_data.shape[0] * split)
    train_x = x[: split_boundary]
    test_x = x[split_boundary:]
    train_y = y[: split_boundary]
    test_y = y[split_boundary:]
    return train_x, train_y, test_x, test_y, scaler
def build_model(n_samples, time_steps, input_dim):
    model = Sequential()
    model.add(LSTM(input_dim=1, output_dim=50, return_sequences=True))
    model.add(LSTM(100, return_sequences=False))
    model.add(Dense(output_dim=1))
    model.add(Activation('linear'))
    model.compile(loss='mse', optimizer='rmsprop')
    print(model.layers)
    return model
def train_model(train_x, train_y, test_x, test_y):
    model = build_model()
    model.fit(train_x, train_y, batch_size=128, nb_epoch=30, validation_split=0.1)
    return model
train_x, train_y, test_x, test_y, scaler = load_data(file path)
train_x = np.reshape(train_x, (train_x.shape[0], train_x.shape[1], 1))
test_x = np.reshape(test_x, (test_x.shape[0], test_x.shape[1], 1))
model = train_model(train_x, train_y, test_x, test_y)
from keras import backend as K
layers = model.layers
K.eval(layers[1].output)
In TensorFlow 2.x, you can do like this:
from tensorflow.python.keras import backend as K
model = build_model()
# lstm_67 is the second layer.
lstm = K.function([model.layers[0].input], [model.layers[1].output])
lstm_output = lstm([test_x])[0]
keras.backend.eval() should do.
Look at the documentation here and here
First of all, this is a tensor; you need to use the tf.Print() method to see the actual values. If you use Spyder, you will not see this output in the console; you need to run the program from the command line.
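Another TF 2.x option that avoids backend functions is to wrap the intermediate tensor in a second Model (a sketch, assuming the trained model and test_x from the code above):
import tensorflow as tf
# Hypothetical: expose lstm_67 (the second layer) as a model output.
feature_extractor = tf.keras.Model(inputs=model.input,
                                   outputs=model.layers[1].output)
lstm_output = feature_extractor.predict(test_x)  # shape (n_samples, 100)
The resulting (n_samples, 100) array can then be passed directly to k-means.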

One-hot vector prediction always returns the same value

My deep neural network returns the same output for every input. I tried (with no luck) different variations of:
loss
optimizer
network topology / layers types
number of epochs (1-100)
I have 3 outputs (one-hot), and for every input the outputs look like this (they change after every training run):
4.701869785785675049e-01 4.793547391891479492e-01 2.381391078233718872e-01
This problem probably happens because of the highly random nature of my training data (stock prediction).
The data set is also heavily skewed towards one of the answers (that's why I used sample_weight, calculated proportionally).
I think I can rule out overfitting (it happens even for 1 epoch and I have dropout layers).
One example of my network:
xs_conv = xs.reshape(xs.shape[0], xs.shape[1], 1)
model_conv = Sequential()
model_conv.add(Conv1D(128, 15, input_shape=(input_columns,1), activation='relu'))
model_conv.add(MaxPooling1D(pool_size=3))
model_conv.add(Dropout(0.4))
model_conv.add(Conv1D(64, 15, input_shape=(input_columns,1), activation='relu'))
model_conv.add(MaxPooling1D(pool_size=3))
model_conv.add(Dropout(0.4))
model_conv.add(Flatten())
model_conv.add(Dense(128, activation='relu'))
model_conv.add(Dropout(0.4))
model_conv.add(Dense(3, activation='sigmoid'))
model_conv.compile(loss='mean_squared_error', optimizer='nadam', metrics=['accuracy'])
model_conv.fit(xs_conv, ys, epochs=10, batch_size=16, sample_weight=sample_weight, validation_split=0.3, shuffle=True)
I would understand if the outputs were random, but what happens seems very peculiar. Any ideas?
Data: computed.csv
Whole code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Input, Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras.models import Model, Sequential
from keras import backend as K
import random
DATA_DIR = '../../Data/'
INPUT_DATA_FILE = DATA_DIR + 'computed.csv'
def get_y(row):
    profit = 0.010
    hot_one = [0, 0, 0]
    hot_one[0] = int(row.close_future_5 >= profit)
    hot_one[1] = int(row.close_future_5 <= -profit)
    hot_one[2] = int(row.close_future_5 < profit and row.close_future_10 > -profit)
    return hot_one
def rolling_window(window, arr):
    return [np.array(arr[i:i+window]).transpose().flatten().tolist() for i in range(0, len(arr))][0:-window+1]
def prepare_data(data, window, test_split):
    xs1 = data.iloc[:, 1:26].as_matrix()
    ys1 = [get_y(row) for row in data.to_records()]
    xs = np.array(rolling_window(window, xs1)).tolist()
    ys = ys1[0:-window+1]
    zipped = list(zip(xs, ys))
    random.shuffle(zipped)
    train_size = int((1.0 - test_split) * len(data))
    xs, ys = zip(*zipped[0:train_size])
    xs_test, ys_test = zip(*zipped[train_size:])
    return np.array(xs), np.array(ys), np.array(xs_test), np.array(ys_test)
def get_sample_weight(y):
    if y[0]: return ups_w
    elif y[1]: return downs_w
    else: return flats_w
data = pd.read_csv(INPUT_DATA_FILE)
window = 30
test_split = .9
xs, ys, xs_test, ys_test = prepare_data(data, window, test_split)
ups_cnt = sum(y[0] for y in ys)
downs_cnt = sum(y[1] for y in ys)
flats_cnt = sum(y[0] == False and y[1] == False for y in ys)
total_cnt = ups_cnt + downs_cnt + flats_cnt
ups_w = total_cnt/ups_cnt
downs_w = total_cnt/downs_cnt
flats_w = total_cnt/flats_cnt
sample_weight = np.array([get_sample_weight(y) for y in ys])
_, input_columns = xs.shape
xs_conv = xs.reshape(xs.shape[0], xs.shape[1], 1)
model_conv = Sequential()
model_conv.add(Conv1D(128, 15, input_shape=(input_columns,1), activation='relu'))
model_conv.add(MaxPooling1D(pool_size=3))
model_conv.add(Dropout(0.4))
model_conv.add(Conv1D(64, 15, input_shape=(input_columns,1), activation='relu'))
model_conv.add(MaxPooling1D(pool_size=3))
model_conv.add(Dropout(0.4))
model_conv.add(Flatten())
model_conv.add(Dense(128, activation='relu'))
model_conv.add(Dropout(0.4))
model_conv.add(Dense(3, activation='sigmoid'))
model_conv.compile(loss='mean_squared_error', optimizer='nadam', metrics=['accuracy'])
model_conv.fit(xs_conv, ys, epochs=1, batch_size=16, sample_weight=sample_weight, validation_split=0.3, shuffle=True)
xs_test_conv = xs_test.reshape(xs_test.shape[0], xs_test.shape[1], 1)
res = model_conv.predict(xs_test_conv)
plotdata = pd.concat([pd.DataFrame(res, columns=['res_up','res_down','res_flat']), pd.DataFrame(ys_test, columns=['ys_up','ys_down','y_flat'])], axis = 1)
plotdata[['res_up', 'ys_up']][3000:3500].plot(figsize=(20,4))
plotdata[['res_down', 'ys_down']][3000:3500].plot(figsize=(20,4))
I have run your model with the attached data, and so far I can say that the biggest problem is a lack of data cleaning.
For instance, there is an inf value in the .csv at line 623. After I filtered those rows out with
xs1 = xs1[np.isfinite(xs1).all(axis=1)]
...I collected some statistics over xs, namely min, max and mean. They turned out to be pretty remarkable:
-43.0049723138
32832.3333333 # !!!
0.213126234391
On average, the values are close to 0, but some are 6 orders of magnitude higher. These particular rows definitely hurt the neural network, so you should either filter them out as well or come up with a clever way to normalize the features.
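For example, one way to do both at once is a robust per-feature scaling (a sketch, assuming xs1 as above; RobustScaler is my suggestion, not something the original answer used):
import numpy as np
from sklearn.preprocessing import RobustScaler
# Drop rows containing inf/NaN, then scale each feature by its median and
# interquartile range so the extreme rows no longer dominate training.
xs1 = xs1[np.isfinite(xs1).all(axis=1)]
xs1 = RobustScaler().fit_transform(xs1)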
But even with those rows left in, the model reached 71-79% validation accuracy. The result distribution is a bit skewed towards the 3rd class, but in general it is too diverse to call peculiar: 19% for class 1, 7% for class 2, 73% for class 3. Example test output:
[[ 1.93120316e-02 4.47684433e-04 9.97518778e-01]
[ 1.40607255e-02 2.45630667e-02 9.74113524e-01]
[ 3.07740629e-01 4.80920941e-01 2.28664145e-01]
...,
[ 5.72797097e-02 9.45571139e-02 8.07634115e-01]
[ 1.05512664e-01 8.99530351e-02 6.70437515e-01]
[ 5.24505274e-03 1.46622911e-01 9.42657173e-01]]

How to use different activation functions in one Keras layer?

I am working on Keras in Python and I have a neural network (see code below).
Currently it works with only a ReLu activation.
For experimental reasons I would like to have some neurons on ReLU and some on softmax (or any other activation function). For example, in a layer with 20 neurons, I would like to have 10 with ReLU and 10 with softmax.
I have tried some different ways, but always failed to get an output.
Would you know how I should do this?
# - Libraries
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
early_spotting_monitor = EarlyStopping(patience=2)
layers = 4
neurons = 20
act = "relu"
# - Create Neural Network
model = Sequential()
model.add(Dense(neurons,activation=act,input_dim=X_train.shape[1]))
layers -= 1
while layers > 0:
    model.add(Dense(neurons, activation=act))
    layers -= 1
model.add(Dense(n_months))
model.compile(optimizer="adam",loss="mean_absolute_error")
model.fit(X_train,Y_train,validation_split=0.10,epochs=13,callbacks=[early_spotting_monitor])
EDIT: this is my (working) code now:
# - Libraries
from keras.callbacks import EarlyStopping
early_spotting_monitor = EarlyStopping(patience=2)
from keras.layers import Input, Dense
from keras.models import Model
from keras.layers.merge import concatenate
# input layer
visible = Input(shape=(X_train.shape[1],))
hidden11 = Dense(14, activation='relu')(visible)
hidden12 = Dense(3, activation='softplus')(visible)
hidden13 = Dense(2, activation='linear')(visible)
hidden14 = Dense(2, activation='selu')(visible)
merge1 = concatenate([hidden11, hidden12, hidden13, hidden14])
hidden21 = Dense(14, activation='relu')(merge1)
hidden22 = Dense(3, activation='softplus')(merge1)
hidden23 = Dense(2, activation='linear')(merge1)
hidden24 = Dense(2, activation='selu')(merge1)
merge2 = concatenate([hidden21, hidden22, hidden23, hidden24])
hidden3 = Dense(20, activation='relu')(merge2)
output = Dense(Y_train.shape[1], activation="linear")(hidden3)
model = Model(inputs=visible, outputs=output)
model.compile(optimizer="adam", loss="mean_absolute_error")
model.fit(X_train, Y_train, validation_split=0.10, epochs=13, callbacks=[early_spotting_monitor])  # starts training
You have to use the Functional API to do this, for example:
input = Input(shape=(X_train.shape[1],))
branchA = Dense(neuronsA, activation="relu")(input)
branchB = Dense(neuronsB, activation="sigmoid")(input)
out = concatenate([branchA, branchB])
You cannot do it with the Sequential API, so I recommend you move your code to the Functional API.
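Putting the pieces together, a complete split-activation layer might look like this (a sketch with made-up unit counts, assuming X_train and Y_train from the question):
from keras.layers import Input, Dense, concatenate
from keras.models import Model
inputs = Input(shape=(X_train.shape[1],))
branchA = Dense(10, activation='relu')(inputs)     # 10 ReLU neurons
branchB = Dense(10, activation='softmax')(inputs)  # 10 softmax neurons
merged = concatenate([branchA, branchB])           # behaves as one 20-unit layer
outputs = Dense(Y_train.shape[1], activation='linear')(merged)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='mean_absolute_error')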
So this is something I have been trying to do recently and so far this is what I have done. I think it's working, but I would appreciate if anyone tells me what I'm doing wrong here. I'm doing this only on the output layer and my output layer has two units:
def activations(l):
    l_0 = tf.keras.activations.exponential(l[..., 0])
    l_1 = tf.keras.activations.elu(l[..., 1])
    lnew = tf.stack([l_0, l_1], axis=1)
    return lnew
model = tf.keras.Sequential([..., Dense(2, activation=activations)])

Using the deep neural network package "Chainer" to train a simple dataset

I'm trying to use the Chainer package for a large project I'm working on. I have read through the tutorial on their website, which gives an example of applying it to the MNIST dataset, but it doesn't seem to scale easily to other examples, and there is simply not enough documentation otherwise.
Their example code is as follows:
class MLP(Chain):
    def __init__(self, n_units, n_out):
        super(MLP, self).__init__(
            # the size of the inputs to each layer will be inferred
            l1=L.Linear(None, n_units),  # n_in -> n_units
            l2=L.Linear(None, n_units),  # n_units -> n_units
            l3=L.Linear(None, n_out),    # n_units -> n_out
        )
    def __call__(self, x):
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        y = self.l3(h2)
        return y
train, test = datasets.get_mnist()
train_iter = iterators.SerialIterator(train, batch_size=5, shuffle=True)
test_iter = iterators.SerialIterator(test, batch_size=2, repeat=False, shuffle=False)
model = L.Classifier(MLP(100, 10)) # the input size, 784, is inferred
optimizer = optimizers.SGD()
optimizer.setup(model)
updater = training.StandardUpdater(train_iter, optimizer)
trainer = training.Trainer(updater, (4, 'epoch'), out='result')
trainer.extend(extensions.Evaluator(test_iter, model))
trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport(['epoch', 'main/accuracy', 'validation/main/accuracy']))
trainer.extend(extensions.ProgressBar())
trainer.run()
Could someone point me in the direction of how to simply fit a straight line to a few data points in 2D? If I can understand a simple fit like this, I should be able to scale up appropriately.
Thanks for the help!
I pasted a simple regression example below (a toy straight-line dataset is sketched after the code).
You can provide your own train and test data as tuples:
train = (data, label)
Here, data.shape = (number of samples, data dimension)
and label.shape = (number of samples,).
Both should have dtype numpy.float32.
import chainer
from chainer.functions import *
from chainer.links import *
from chainer.optimizers import *
from chainer import training
from chainer.training import extensions
from chainer import reporter
from chainer import datasets
import numpy
class MyNet(chainer.Chain):
    def __init__(self):
        super(MyNet, self).__init__(
            l0=Linear(None, 30, nobias=True),
            l1=Linear(None, 1, nobias=True),
        )
    def __call__(self, x, t):
        l0 = self.l0(x)
        f0 = relu(l0)
        l1 = self.l1(f0)
        f1 = flatten(l1)
        self.loss = mean_squared_error(f1, t)
        reporter.report({'loss': self.loss}, self)
        return self.loss
def get_optimizer():
    return Adam()
def training_main():
    model = MyNet()
    optimizer = get_optimizer()
    optimizer.setup(model)
    train, test = datasets.get_mnist(label_dtype=numpy.float32)
    train_iter = chainer.iterators.SerialIterator(train, 50)
    test_iter = chainer.iterators.SerialIterator(test, 50,
                                                 repeat=False,
                                                 shuffle=False)
    updater = training.StandardUpdater(train_iter, optimizer)
    trainer = training.Trainer(updater, (10, 'epoch'))
    trainer.extend(extensions.ProgressBar())
    trainer.extend(extensions.Evaluator(test_iter, model))
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                              'epoch'))
    trainer.run()
if __name__ == '__main__':
    training_main()
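To get at the straight-line part of the question, the get_mnist() call in training_main() could be replaced with a hand-built dataset (a sketch; the line y = 2x + 1 and the noise level are made up for illustration):
import numpy
from chainer import datasets
# Toy data: 200 points on y = 2x + 1 with a little Gaussian noise.
x = numpy.linspace(0, 1, 200, dtype=numpy.float32).reshape(-1, 1)
noise = numpy.random.normal(0, 0.05, 200).astype(numpy.float32)
y = (2 * x + 1).ravel() + noise
train = datasets.TupleDataset(x, y)
test = datasets.TupleDataset(x, y)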

Duplicate values in read from file minibatches TensorFlow

I followed the tutorial about reading data with TF and tried some experiments of my own. Now the problem is that my tests show duplicate data in the batches I created when reading data from a CSV.
My code looks like this:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import collections
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
class XICSDataSet:
    def __init__(self, height=20, width=195, batch_size=1000, noutput=15):
        self.depth = 1
        self.height = height
        self.width = width
        self.batch_size = batch_size
        self.noutput = noutput
    def trainingset_files_reader(self, data_dir, nfiles):
        fnames = [os.path.join(data_dir, "test%d" % i) for i in range(nfiles)]
        filename_queue = tf.train.string_input_producer(fnames, shuffle=False)
        reader = tf.TextLineReader()
        key, value = reader.read(filename_queue)
        record_defaults = [[.0], [.0], [.0], [.0], [.0]]
        data_tuple = tf.decode_csv(value, record_defaults=record_defaults, field_delim=' ')
        features = tf.pack(data_tuple[:-self.noutput])
        label = tf.pack(data_tuple[-self.noutput:])
        depth_major = tf.reshape(features, [self.height, self.width, self.depth])
        min_after_dequeue = 100
        capacity = min_after_dequeue + 30 * self.batch_size
        example_batch, label_batch = tf.train.shuffle_batch([depth_major, label], batch_size=self.batch_size,
                                                            capacity=capacity,
                                                            min_after_dequeue=min_after_dequeue)
        return example_batch, label_batch
with tf.Graph().as_default():
    ds = XICSDataSet(2, 2, 3, 1)
    im, lb = ds.trainingset_files_reader(filename, 1)
    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)
    tf.train.start_queue_runners(sess=sess)
    for i in range(1000):
        lbs = sess.run([im, lb])[1]
        _, nu = np.unique(lbs, return_counts=True)
        if np.array_equal(nu, np.array([1, 1, 1])) == False:
            print('Not unique elements found in a batch!')
            print(lbs)
I tried different batch sizes, different numbers of files, and different values of capacity and min_after_dequeue, but I always get the problem. In the end, I would like to be able to read data from only one file, creating batches and shuffling the examples.
My files, created ad hoc for this test, each have 5 lines (each line representing a sample) and 5 columns. The last column is meant to be the label for that sample. These are just random numbers. I'm using only 10 files just to test this out.
The default behavior for tf.train.string_input_producer(fnames) is to produce an infinite number of copies of the elements in fnames. Therefore, since your tf.train.shuffle_batch() capacity is larger than the total number of elements in your input files (5 elements per file * 10 files = 50 elements), and the min_after_dequeue is also larger than the number of elements, the queue will contain at least two full copies of the input data before the first batch is produced. As a result, it is likely that some batches will contain duplicate data.
If you only want to process each example once, you can set an explicit num_epochs=1 when creating the tf.train.string_input_producer(). For example:
def trainingset_files_reader(self, data_dir, nfiles):
    fnames = [os.path.join(data_dir, "test%d" % i) for i in range(nfiles)]
    filename_queue = tf.train.string_input_producer(
        fnames, shuffle=False, num_epochs=1)
    # ...
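One caveat worth adding (my note, based on the TF 1.x queue-runner behavior, not part of the original answer): when num_epochs is set, string_input_producer keeps the epoch counter in a local variable, so the local variables must be initialized as well or the queue will fail to start, roughly:
init = tf.initialize_all_variables()
sess.run(init)
sess.run(tf.local_variables_initializer())  # initializes the num_epochs counter
tf.train.start_queue_runners(sess=sess)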