How to use nn.MultiheadAttention together with nn.LSTM? - neural-network

I'm trying to build a Pytorch network for image captioning.
Currently I have a working network of Encoder and Decoder, and I want to add nn.MultiheadAttnetion layer to it (to be used as self attention).
Currently my decode looks like this:
class Decoder(nn.Module):
def __init__(self, hidden_size, embed_dim, vocab_size, layers = 1):
super(Decoder, self).__init__()
self.embed_dim = embed_dim
self.vocab_size = vocab_size
self.layers = layers
self.hidden_size = hidden_size
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
self.lstm = nn.LSTM(input_size = embed_dim, hidden_size = hidden_size, batch_first = True, num_layers = layers)
#self.attention = nn.MultiheadAttention(hidden_size, num_heads=1, batch_first= True)
self.fc = nn.Linear(hidden_size, self.vocab_size)
def init_hidden(self, batch_size):
h = torch.zeros(self.layers, batch_size, self.hidden_size).to(device)
c = torch.zeros(self.layers, batch_size, self.hidden_size).to(device)
return h,c
def forward(self, features, caption):
batch_size = caption.size(0)
caption_size = caption.size(1)
h,c = self.init_hidden(batch_size)
embeddings = self.embedding(caption)
lstm_input = torch.cat((features.unsqueeze(1), embeddings[:,:-1,:]), dim=1)
output, (h,c) = self.lstm(lstm_input, (h,c))
#output, _ = self.attention(output, output, output)
output = self.fc(output)
return output
def generate_caption(self, features, max_caption_size = MAX_LEN):
h,c = self.init_hidden(1)
caption = ""
embeddings = features.unsqueeze(1)
for i in range(max_caption_size):
output, (h, c) = self.lstm(embeddings, (h,c))
#output, _ = self.attention(output, output, output)
output = self.fc(output)
_, word_index = torch.max(output, dim=2) # take the word with highest probability
if word_index == vocab.get_index(END_WORD):
break
caption += vocab.get_word(word_index) + " "
embeddings = self.embedding(torch.LongTensor([word_index]).view(1,-1).to(device))
return caption
and it gives relatively good results for image captioning.
I want to add the commented out lines so the model will use Attention. But- when I do that- the model breaks, although the loss becomes extremely low (decreasing from 2.7 to 0.2 during training instead of 2.7 to 1 without the attention) - the caption generation is not really working (predicts the same word over and over again).
My questions are:
Am I using the nn.MultiheadAttention correctly? it is very weird to me that it should be used after the LSTM, but I saw this online, and it works from dimension sizes perspective
Any idea why my model breaks when I use Attention?
EDIT: I also tried to put the Attention before the LSTM, and it didn't work as well (network predicted the same caption for every picture)

Related

Issue while solving Gym Environment ; ValueError: Weights for model sequential have not yet been created

I am new to stackoverflow, so I apologize for any errors while asking a question. I am trying to solve the cartpole-v1 gym environment using a dqn agent. I am facing an issue as follows ValueError: Weights for model sequential have not yet been created. Weights are created when the Model is first called on inputs or build() is called with an input_shape. I've searched how to fix this but to no success. My tensorflow version is 2.8.0. My code for my agent is as follows. I believe, the problem is most probably due to my build_model and in the model.fit line. This is the error that I am facing
class DQNAgent0:
def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
self.memory = deque(maxlen=2000)
self.gamma = 0.95 # discount factor
self.epsilon = 1.0 # 100% exploration at the start
self.epsilon_decay = 0.995
self.epsilon_min = 0.01
self.learning_rate = 0.001
self.model = self._build_model()
def _build_model(self):
'''model = tf.keras.Sequential([
tf.keras.layers.Dense(1),
#tf.keras.Input((self.state_size,)),
tf.keras.layers.Dense(24, activation="relu"),
tf.keras.layers.Dense(24, activation="relu"),
tf.keras.layers.Dense(self.action_size, activation="linear"),
])
model.compile(loss=tf.keras.losses.mse,
optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))'''
#model = tf.keras.Sequential()
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.add(tf.keras.Input(shape = self.state_size))
model.add(tf.keras.layers.Dense(24, activation = 'relu'))
model.add(tf.keras.layers.Dense(24, activation = 'relu'))
model.add(tf.keras.layers.Dense(self.action_size, activation = 'linear'))
#opt = tf.keras.optimizers.Adam(learning_rate = self.learning_rate)
#model.compile(loss = 'mse', optimizer = opt)
model.compile(loss = tf.keras.losses.mse, optimizer = tf.keras.optimizers.Adam(learning_rate = self.learning_rate))
return model
def remember(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done))
def act(self, state):
if np.random.rand() <= self.epsilon:
return random .randrange(self.action_size) # exploratory action
act_values = self.model.predict(state)
return np.argmax(act_values[0])
def replay(self, batch_size):
#creating a random sample from our memory
minibatch = random.sample(self.memory, batch_size)
for state, action, reward, next_state, done in minibatch:
target = reward
if not done:
target = (reward + self.gamma * np.amax(self.model.predict(next_state[0]))) # reward at current timestep + discounted future reward
target_f = self.model.predict(state)
target_f[0][action] = target #mapping future reward to the current reward
self.model.fit(tf.expand_dims(state, axis=-1), target_f, epochs = 1, verbose = 0) # fitting a model to train with state as input x and target_f as y (predicted future reward)
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
def load(self, name):
self.model.load_weights(name)
def save(self, name):
self.model.save_weights(name)

ValueError: Target size (torch.Size([128])) must be the same as input size (torch.Size([112]))

I have a training function, in which inside there are two vectors:
d_labels_a = torch.zeros(128)
d_labels_b = torch.ones(128)
Then I have these features:
# Compute output
features_a = nets[0](input_a)
features_b = nets[1](input_b)
features_c = nets[2](inputs)
And then a domain classifier (nets[4]) makes predictions:
d_pred_a = torch.squeeze(nets[4](features_a))
d_pred_b = torch.squeeze(nets[4](features_b))
d_pred_a = d_pred_a.float()
d_pred_b = d_pred_b.float()
print(d_pred_a.shape)
The error raises in the loss function: ` pred_a = torch.squeeze(nets3)
pred_b = torch.squeeze(nets3)
pred_c = torch.squeeze(nets3)
loss = criterion(pred_a, labels_a) + criterion(pred_b, labels_b) + criterion(pred_c, labels) + d_criterion(d_pred_a, d_labels_a) + d_criterion(d_pred_b, d_labels_b)
The problem is that d_pred_a/b is different from d_labels_a/b, but only after a certain point. Indeed, when I print the shape of d_pred_a/b it istorch.Size([128])but then it changes totorch.Size([112])` independently.
It comes from here:
# Compute output
features_a = nets[0](input_a)
features_b = nets[1](input_b)
features_c = nets[2](inputs)
because if I print the shape of features_a is torch.Size([128, 2048]) but it changes into torch.Size([112, 2048])
nets[0] is a VGG, like this:
class VGG16(nn.Module):
def __init__(self, input_size, batch_norm=False):
super(VGG16, self).__init__()
self.in_channels,self.in_width,self.in_height = input_size
self.block_1 = VGGBlock(self.in_channels,64,batch_norm=batch_norm)
self.block_2 = VGGBlock(64, 128,batch_norm=batch_norm)
self.block_3 = VGGBlock(128, 256,batch_norm=batch_norm)
self.block_4 = VGGBlock(256,512,batch_norm=batch_norm)
#property
def input_size(self):
return self.in_channels,self.in_width,self.in_height
def forward(self, x):
x = self.block_1(x)
x = self.block_2(x)
x = self.block_3(x)
x = self.block_4(x)
# x = self.avgpool(x)
x = torch.flatten(x,1)
return x
I solved. The problem was the last batch. I used drop_last=True in the dataloader and It worked.

Embeddings in sentiment analysis task with PyTorch and Multilayer Perception

I have the body of the code, which should work fine, I think it doesn't because of something I'm messing up here, probably having to do with the embedding.
import torch.nn as nn
class MultilayerPerceptron(nn.Module):
def __init__(self, input_size, hidden_size): # I removed output size
# Call initializer function of the super class
super(MultilayerPerceptron, self).__init__()
self.embedding = nn.Embedding(INPUT_DIM, EMBEDDING_DIM) #added this myself, maybe wrong
#self.mlp = nn.MultilayerPerceptron(EMBEDDING_DIM, HIDDEN_DIM) #also added
self.INPUT_DIM = INPUT_DIM
self.HIDDEN_DIM = HIDDEN_DIM
self.OUTPUT_DIM = OUTPUT_DIM
self.EMBEDDING_DIM = EMBEDDING_DIM
#whenever this model is called, those layers in the sequential block
#will be processed in the order given to the block.
self.model = nn.Sequential(
#nn.Flatten(), # adding this hopefully it works (it didn't)
#embeds = embedded.mean(dim=1) #god help
nn.Linear(self.INPUT_DIM, self.HIDDEN_DIM), #later on, multiply by embedding dimensionality #I removed
nn.ReLU(),
nn.Linear(self.HIDDEN_DIM, self.OUTPUT_DIM), #one layer neural network
nn.ReLU(), # do I need this?
nn.Sigmoid(),
)
def forward(self, x):
embedded = self.embedding(x)
#embedded = [sent len, batch size, emb dim]
output, hidden = self.model(embedded)
output = self.model(x) #call the model defined above for forward propagation.
return output
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100 #how do I fit this into the model??
HIDDEN_DIM = 256
OUTPUT_DIM = 1
model = MultilayerPerceptron(INPUT_DIM, HIDDEN_DIM) #MLP instead
The error I get is "mat1 and mat2 shapes cannot be multiplied (50176x100 and 25002x256)".

Where the weights get updated in this code?

I want to train a model in distributed system. I have found a code in github for distributed training where the worker node send gradient to the parameter server and the parameter server sends the average gradient to the workers. But in client/worker side code, i couldn't understand where the received gradient updates the weights and biases.
Here is client/worker side the code, it receives initial gradients from the parameter server and then calculates loss, gradients and sends the gradient value to the server again.
from __future__ import division
from __future__ import print_function
import numpy as np
import sys
import pickle as pickle
import socket
from datetime import datetime
import time
import tensorflow as tf
import cifar10
TCP_IP = 'some IP'
TCP_PORT = 5014
port = 0
port_main = 0
s = 0
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('train_dir', '/home/ubuntu/cifar10_train',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 5000,
"""Number of batches to run.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
tf.app.flags.DEFINE_integer('log_frequency', 10,
"""How often to log results to the console.""")
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.30)
def safe_recv(size, server_socket):
data = ""
temp = ""
data = bytearray()
recv_size = 0
while 1:
try:
temp = server_socket.recv(size-len(data))
data.extend(temp)
recv_size = len(data)
if recv_size >= size:
break
except:
print("Error")
data = bytes(data)
return data
def train():
"""Train CIFAR-10 for a number of steps."""
g1 = tf.Graph()
with g1.as_default():
global_step = tf.Variable(-1, name='global_step',
trainable=False, dtype=tf.int32)
increment_global_step_op = tf.assign(global_step, global_step+1)
# Get images and labels for CIFAR-10.
images, labels = cifar10.distorted_inputs()
# Build a Graph that computes the logits predictions from the
# inference model.
logits = cifar10.inference(images)
# Calculate loss.
loss = cifar10.loss(logits, labels)
grads = cifar10.train_part1(loss, global_step)
only_gradients = [g for g, _ in grads]
class _LoggerHook(tf.train.SessionRunHook):
"""Logs loss and runtime."""
def begin(self):
self._step = -1
self._start_time = time.time()
def before_run(self, run_context):
self._step += 1
return tf.train.SessionRunArgs(loss) # Asks for loss value.
def after_run(self, run_context, run_values):
if self._step % FLAGS.log_frequency == 0:
current_time = time.time()
duration = current_time - self._start_time
self._start_time = current_time
loss_value = run_values.results
examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
sec_per_batch = float(duration / FLAGS.log_frequency)
format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch)')
print(format_str % (datetime.now(), self._step, loss_value,
examples_per_sec, sec_per_batch))
with tf.train.MonitoredTrainingSession(
checkpoint_dir=FLAGS.train_dir,
hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
tf.train.NanTensorHook(loss),
_LoggerHook()],
config=tf.ConfigProto(
# log_device_placement=FLAGS.log_device_placement, gpu_options=gpu_options)) as mon_sess:
log_device_placement=FLAGS.log_device_placement)) as mon_sess:
global port
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((TCP_IP, port_main))
recv_size = safe_recv(17, s)
recv_size = pickle.loads(recv_size)
recv_data = safe_recv(recv_size, s)
var_vals = pickle.loads(recv_data)
s.close()
feed_dict = {}
i = 0
for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
feed_dict[v] = var_vals[i]
i = i+1
print("Received variable values from ps")
# Opening the socket and connecting to server
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((TCP_IP, port))
while not mon_sess.should_stop():
gradients, step_val = mon_sess.run(
[only_gradients, increment_global_step_op], feed_dict=feed_dict)
# sending the gradients
send_data = pickle.dumps(gradients, pickle.HIGHEST_PROTOCOL)
to_send_size = len(send_data)
send_size = pickle.dumps(to_send_size, pickle.HIGHEST_PROTOCOL)
s.sendall(send_size)
s.sendall(send_data)
# receiving the variable values
recv_size = safe_recv(17, s)
recv_size = pickle.loads(recv_size)
recv_data = safe_recv(recv_size, s)
var_vals = pickle.loads(recv_data)
feed_dict = {}
i = 0
for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
feed_dict[v] = var_vals[i]
i = i+1
s.close()
def main(argv=None): # pylint: disable=unused-argument
global port
global port_main
global s
if(len(sys.argv) != 3):
print("<port> <worker-id> required")
sys.exit()
port = int(sys.argv[1]) + int(sys.argv[2])
port_main = int(sys.argv[1])
print("Connecting to port ", port)
cifar10.maybe_download_and_extract()
if tf.gfile.Exists(FLAGS.train_dir):
tf.gfile.DeleteRecursively(FLAGS.train_dir)
tf.gfile.MakeDirs(FLAGS.train_dir)
total_start_time = time.time()
train()
print("--- %s seconds ---" % (time.time() - total_start_time))
if __name__ == '__main__':
tf.app.run()
EDIT:
Here is the train_part1() code:
def train_part1(total_loss, global_step):
"""Train CIFAR-10 model.
Create an optimizer and apply to all trainable variables. Add moving
average for all trainable variables.
Args:
total_loss: Total loss from loss().
global_step: Integer Variable counting the number of training steps
processed.
Returns:
train_op: op for training.
"""
# Variables that affect learning rate.
num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
tf.summary.scalar('learning_rate', lr)
# Generate moving averages of all losses and associated summaries.
loss_averages_op = _add_loss_summaries(total_loss)
# Compute gradients.
with tf.control_dependencies([loss_averages_op]):
opt = tf.train.GradientDescentOptimizer(lr)
grads = opt.compute_gradients(total_loss)
return grads
To me it seems that line
gradients, step_val = mon_sess.run(
[only_gradients, increment_global_step_op], feed_dict=feed_dict)
receieves new values for variables in feed_dict, assign these values to variables, and makes a training step, during which it only calculates and returns the gradients, that are later sent to the parameter server. I would expect cifar10.train_part1 (the one that returns only_gradients) to depend on variable values and define the update.
Update: I looked into the code and changed my mind. Had to google and found next answer that shed some light on what is happening.
Gradients are actually not applied in this code anywhere implicitly. Instead, gradients are sent to the parameter server, parameter server averages gradients and applies them to weights, it returns the weights to the local worker, * recieved weights are used instead of local weights during session run through feed_dict* i.e. local weights are never actually updated and do not actually matter at all. The key, is that feed_dict allows to rewrite any tensor output of the session run and this code rewrites variables.

Pytorch tutorial LSTM

I was trying to implement the exercise about Sequence Models and Long-Short Term Memory Networks with Pytorch. The idea is to add an LSTM part-of-speech tagger character-level features but I can't seem to work it out. They gave as a hint that there should be two LSTMs involved, one that will output a character level representation and another one that will be in charge of predicting the Part-of-speech tag. I just can't figure out how to loop over the words level (in a sentence) and over the character (in each word of a sentence) and implement it in the forward function. Does anyone know how to do it ? Or encounter a similar situation ?
Here is my code:
class LSTMTaggerAug(nn.Module):
def __init__(self, embedding_dim_words, embedding_dim_chars, hidden_dim_words, hidden_dim_chars, vocab_size, tagset_size, charset_size):
super(LSTMTaggerAug, self).__init__()
self.hidden_dim_words = hidden_dim_words
self.hidden_dim_chars = hidden_dim_chars
self.word_embeddings = nn.Embedding(vocab_size, embedding_dim_words)
self.char_embeddings = nn.Embedding(charset_size, embedding_dim_chars)
self.lstm_char = nn.LSTM(embedding_dim_chars, hidden_dim_chars)
self.lstm_words = nn.LSTM(embedding_dim_words + hidden_dim_chars, hidden_dim_words)
self.hidden2tag = nn.Linear(hidden_dim_words, tagset_size)
self.hidden_char = self.init_hidden(c=False)
self.hidden_words = self.init_hidden(c=True)
def init_hidden(self, c=True):
if c:
return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim_words)),
autograd.Variable(torch.zeros(1, 1, self.hidden_dim_words)))
else:
return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim_chars)),
autograd.Variable(torch.zeros(1, 1, self.hidden_dim_chars)))
def forward(self, sentence, words):
# embeds = self.word_embeddings(sentence)
for ix, word in enumerate(sentence):
chars = words[ix]
char_embeds = self.char_embeddings(chars)
lstm_char_out, self.hidden_char = self.lstm_char(
char_embeds.view(len(chars), 1, -1), self.hidden_char)
char_rep = lstm_char_out[-1]
embeds = self.word_embeddings(word)
embeds_cat = torch.cat((embeds, char_rep), dim=1)
lstm_out, self.hidden_words = self.lstm_words(embeds_cat, self.hidden_words)
tag_space = self.hidden2tag(lstm_out.view(1, -1))
tag_score = F.log_softmax(tag_space, dim=1)
if ix==0:
tag_scores = tag_score
else:
tag_scores = torch.cat((tag_scores, tag_score), 0)
return tag_scores
The most naive way to do it according to your description would be to take a sentence s stripped of punctuation. Then split it into words:
words = s.split()
Take your first character level lstm, LSTMc and apply it to every word individually to encode the words (use the last output-state of the lstm to encode the word):
encoded_words = []
for word in words:
state = state_0
for char in word:
h, state = LSTMc(one_hot_encoding(char), state)
encoded_words.append(h)
After you've encoded the words, you pass word-level the part of speech tagger lstm LSTMw on the encoded words:
state = statew_0
parts_of_speech = []
for enc_word in encoded_words:
pos, state = LSTMw(enc_word, state)
parts_of_speech.append(pos)