I use an LSTM with attention, but the model does not learn. How can I improve the model? - neural-network

def __init__(self):
self.lstm = nn.LSTM(input_dim,
self.fc = nn.Linear(hidden_dim * 2, num_classes)
def attention_net(self, lstm_output, final_state):
hidden = final_state.unsqueeze(2)
attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)
soft_attn_weights = F.softmax(attn_weights, 1)
context = torch.bmm(lstm_output.transpose(1, 2),
return context, soft_attn_weights.cpu().data.numpy()
def forward(self, text):
output, (hn, cn) = self.lstm(text)
hn = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim = 1)
attn_output, attention = self.attention_net(output, hn)
return self.fc(attn_output), attention`
I use an LSTM + attention. The model should distinguish 3 classes, but it does not learn: it gives me only one class all the time.


How initialize weights of a `torch.nn.Transformer` module?

I am using a vanilla transformer architecture from the "Attention Is All You Need" paper for a sequence-to-sequence task. As shown in the following code.
Assuming that I would like to use the torch.nn.init.kaiming_uniform_ initialization method, how would one go about initializing the weights of the nn.Transformer ?
Is it necessary to use a custom encoder and decoder class in order for that to happen?
import torch
import torch.nn as nn
import math
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
import torch.nn as nn
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to token embeddings.

    A table of shape ``(maxlen, 1, emb_size)`` is precomputed once and stored
    as a non-trainable buffer. Even feature columns hold ``sin(pos * den)``
    and odd feature columns hold ``cos(pos * den)``.

    Args:
        emb_size: Size of the embedding (last) dimension. May be even or odd.
        dropout: Dropout probability applied after the encoding is added.
        maxlen: Number of positions in the precomputed table. ``forward``
            slices the table by the input's first dimension, so inputs longer
            than ``maxlen`` along that dimension cannot be broadcast against
            the table and the addition fails.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 20):
        super().__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den  # (maxlen, ceil(emb_size / 2))
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For an odd emb_size there is one fewer odd column than even columns,
        # so trim the angle table to avoid a shape mismatch on assignment.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis so the table broadcasts over dimension 1
        # of the input.
        pos_embedding = pos_embedding.unsqueeze(-2)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        """Return ``dropout(token_embedding + encoding)``.

        The table is sliced along its first axis by
        ``token_embedding.size(0)``, so the input's first dimension is
        treated as the position axis.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Map token indices to embeddings scaled by ``sqrt(emb_size)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Size of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        # He (Kaiming) uniform initialization. The call mutates the existing
        # parameter in place, so its return value need not be re-assigned.
        nn.init.kaiming_uniform_(self.embedding.weight)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        """Look up ``tokens`` (cast to int64) and scale by ``sqrt(emb_size)``."""
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)
# Seq2Seq Network
class Transformer(nn.Module):
def __init__(self,
src_vocab_size: int,
tgt_vocab_size: int,
num_encoder_layers: int = 1,
num_decoder_layers: int = 1,
emb_size: int = 300,
nhead: int = 3,
dim_feedforward: int = 512,
dropout: float = 0.1,
self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
self.positional_encoding = PositionalEncoding(
emb_size, dropout=dropout)
self.transformer = torch.nn.Transformer(d_model=emb_size,
self.generator = nn.Linear(emb_size, tgt_vocab_size)
def init_weights(self):
self.generator.weight = nn.init.kaiming_uniform_(self.generator.weight)
def forward(self,
src: Tensor,
trg: Tensor,
src_mask: Tensor,
tgt_mask: Tensor,
src_padding_mask: Tensor,
tgt_padding_mask: Tensor,
memory_key_padding_mask: Tensor):
# the .permute() is necessary since the positional-encoder expects tensors to be of shape
# (seq_len, batch_size, emb_length)
src_emb = self.positional_encoding(self.src_tok_emb(src).permute(1,0,2)).permute(1,0,2)
tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg).permute(1,0,2)).permute(1,0,2)
outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
return self.generator(outs)
def encode(self, src: Tensor, src_mask: Tensor):
return self.transformer.encoder(self.positional_encoding(
self.src_tok_emb(src)), src_mask)
def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
return self.transformer.decoder(self.positional_encoding(
self.tgt_tok_emb(tgt)), memory,

custom activation function in PyTorch - fix prediction

I read this post about custom activation functions, but I still can't get my code to work. My activation function can be expressed as a combination of existing PyTorch functions, and function_pytorch(prediction, Q_sample) itself works fine. [Q_samples is a variable that I need; it doesn't require a gradient.]
My activation function should receive the output of the NN and apply function_pytorch to it; its output then goes into the loss function. So:
class Activation_fun(nn.Module):
    """Custom activation that applies ``function_pytorch`` as an ``nn.Module``.

    Args:
        prediction: Accepted for backward compatibility and ignored; the
            module keeps no state. It is optional so that the module can be
            constructed without arguments.
    """

    def __init__(self, prediction=None):
        # nn.Module's own initializer must run before the module is used.
        super().__init__()

    def forward(self, input, Q_samples):
        """Return ``function_pytorch(input, Q_samples)``."""
        return function_pytorch(input, Q_samples)
in my NN I have
class NeuralNet(nn.Module):
    """BatchNorm -> Linear -> Tanh -> BatchNorm -> custom activation.

    Args:
        input_size: Number of input features.
        hidden_size: Number of output features of the linear layer.
        output_size: Number of features normalized by the second batch norm.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.BN0 = nn.BatchNorm1d(input_size)
        self.l1 = nn.Linear(input_size, hidden_size)
        self.tan = nn.Tanh()
        # NOTE(review): BN normalizes `output_size` features but is applied to
        # the `hidden_size`-wide output of l1, so this only works when
        # hidden_size == output_size. Confirm whether a Linear(hidden_size,
        # output_size) layer is missing.
        self.BN = nn.BatchNorm1d(output_size)
        # Custom activation; requires Activation_fun to be constructible
        # without arguments.
        self.l2 = Activation_fun()

    def forward(self, x, q):
        """Run ``x`` through the network; ``q`` is passed to the activation."""
        out = self.BN0(x)
        out = self.l1(out)
        # Tanh must receive the running activation as its input.
        out = self.tan(out)
        # The second batch norm is stored as `BN`; no `BN9` attribute exists.
        out = self.BN(out)
        out = self.l2(out, q)
        return out
model = NeuralNet(input_size, hidden_size, output_size)
and in my training epochs:
outputs = model(inputs, q_samples)
The problem is: my prediction remains fixed if I apply my customized activation function.
Is there any problem in my implementation?

Class state gets lost between function calls in Flink

I have this class:
case class IDADiscretizer(
nAttrs: Int,
nBins: Int = 5,
s: Int = 5) extends Serializable {
private[this] val log = LoggerFactory.getLogger(this.getClass)
private[this] val V = Vector.tabulate(nAttrs)(i => new IntervalHeapWrapper(nBins, i))
private[this] val randomReservoir = SamplingUtils.reservoirSample((1 to s).toList.iterator, 1)
def updateSamples(v: LabeledVector): Vector[IntervalHeapWrapper] = {
val attrs = v.vector.map(_._2)
val label = v.label
// TODO: Check for missing values
.foreach {
case (attr, i) =>
if (V(i).getNbSamples < s) {
V(i) insertValue attr // insert
} else {
if (randomReservoir(0) <= s / (i + 1)) {
//val randVal = Random nextInt s
//V(i) replace (randVal, attr)
V(i) insertValue attr
* Return the cutpoints for the discretization
def cutPoints: Vector[Vector[Double]] = V map (_.getBoundaries.toVector)
def discretize(data: DataSet[LabeledVector]): (DataSet[Vector[IntervalHeapWrapper]], Vector[Vector[Double]]) = {
val r = data map (x => updateSamples(x))
val c = cutPoints
(r, c)
Using Flink, I would like to get the cutpoints after the call to discretize, but it seems the information stored in V gets lost. Do I have to use Broadcast like in this question? Is there a better way to access the state of the class?
I've tried to call cutPoints in two ways, one of which is:
def discretize(data: DataSet[LabeledVector]) = data map (x => updateSamples(x))
Then, called from outside:
val a = IDADiscretizer(nAttrs = 4)
val r = a.discretize(dataSet)
val cuts = a.cutPoints
Here, cuts is empty so I tried to compute the discretization as well as the cutpoints inside discretize:
def discretize(data: DataSet[LabeledVector]) = {
val r = data map (x => updateSamples(x))
val c = cutPoints
(r, c)
And use it like this:
val a = IDADiscretizer(nAttrs = 4)
val (d, c) = a.discretize(dataSet)
c foreach println
But the same happens.
Finally, I've also tried to make V completely public:
val V = Vector.tabulate(nAttrs)(i => new IntervalHeapWrapper(nBins, i))
Still empty
What am I doing wrong?
Related questions:
Keep keyed state across multiple transformations
Flink State backend keys atomicy and distribution
Flink: does state access across stream?
Flink: Sharing state in CoFlatMapFunction
Thanks to #TillRohrmann what I finally did was:
private[this] def computeCutPoints(x: LabeledVector) = {
val attrs = x.vector.map(_._2)
val label = x.label
.foldLeft(V) {
case (iv, (v, i)) =>
iv(i) insertValue v
* Return the cutpoints for the discretization
def cutPoints(data: DataSet[LabeledVector]): Seq[Seq[Double]] =
data.map(computeCutPoints _)
def discretize(data: DataSet[LabeledVector]): DataSet[LabeledVector] =
data.map(updateSamples _)
And then use it like this:
val a = IDADiscretizer(nAttrs = 4)
val d = a.discretize(dataSet)
val cuts = a.cutPoints(dataSet)
cuts foreach println
I do not know if it is the best way, but at least it is working now.
The way Flink works is that the user defines operators/user-defined functions which operate on input data coming from a source function. In order to execute a program, the user code is sent to the Flink cluster where it is executed. The results of the computation have to be output to some storage system via a sink function.
Due to this, it is not possible to mix local and distributed computations easily as you are trying with your solution. What discretize does is to define a map operator which transforms the input DataSet data. This operation will be executed once you call ExecutionEnvironment#execute or DataSet#print, for example. Now the user code and the definition for IDADiscretizer is sent to the cluster where they are instantiated. Flink will update the values in an instance of IDADiscretizer which is not the same instance as the one you have on the client.

How can I set the order in which discrete objects are instantiated?

I wrote an object PathGraph which implements a graph of Nodes and various useful functions, which I intend to use for pathfinding in a simple tower defense game. I also wrote a class Path which implements Dijkstra's algorithm, and each non-static in-game unit has a Path.
The problem I am running into is that when I run the application, the code executes the code to initialize the units and, in doing so, initialize a path for each creep before building the PathGraph object (confirmed using Eclipse Scala debugger and println statements). Unfortunately however, the code to generate a path requires that the PathGraph object, and specifically the path variable (var so that I can point to a new path if the map gets updated, etc.), be initialized.
How should I fix this problem with my code? PathGraph code pasted below for reference.
object PathGraph {
private val graph:Array[Array[Node]] = buildAndFillGraph()
//val nodeDist:Double = MainGame.pixelsPerIteration
val nodeDist = .5
val numXNodes = (MainGame.gamePanelWidth.toDouble / nodeDist).toInt
val numYNodes = (MainGame.gamePanelHeight.toDouble / nodeDist).toInt
val defaultInfinity = 99999
//build every Nodes adjacent nodes
val angle = 45
val minHeight = 0
val minWidth = 0
val maxHeight = MainGame.gamePanelSize.height //game panel y value starts at 0 at TOP
val maxWidth = MainGame.gamePanelSize.width
val numPossibleAdjacentNodes = 360 / angle //360 degrees, 45 degree angle between every potentially adjacent Node
val hypotenuseLength = math.sqrt((nodeDist * nodeDist) + (nodeDist * nodeDist))
def buildGraphArray(): Array[Array[Node]] = {
println("numXNodes/nodeDist.toInt: " + (numXNodes.toDouble / nodeDist).toInt + "\n")
//build every Node in the graph
val lgraph =
(for (x <- 0 until (numXNodes / nodeDist).toInt) yield {
(for (y <- 0 until (numYNodes / nodeDist).toInt) yield {
new Node(x.toDouble * nodeDist, y.toDouble * nodeDist)//gives lgraph(x,y) notation
}).toArray //convert IndexedSeqs to Arrays
def buildAndFillGraph():Array[Array[Node]] = {
val lgraph = buildGraphArray()//Ar[Ar[Node]]
println("lgraph built")
lgraph.map(x => x.map(y => y.setAdjacentNodes(lgraph)))
//set the adjacent nodes for all nodes in the array
if (lgraph.size != numXNodes*numYNodes) println("numXNodes*numYNodes: " + numXNodes*numYNodes)
else MainGame.pathGraphBuilt = true
def getGraph() = graph
def toBuffer(): mutable.Buffer[Node] = graph.flatten.toBuffer
def toArray(): Array[Node] = graph.flatten
There are a few things you can do to improve the code:
Do not use static variables. Your PathGraph should be a class, not an object. MainGame.pathGraphBuilt is also a static variable that you can replace with a builder - see the next point.
Use a Builder pattern to differentiate between things that build and the end result. Your PathGraph logic will mostly go into the builder. Something along these lines:
case class PathGraphBuilder(nodeDist: Double, numXNodes: Double /* and so on */) {
def apply: PathGraph = buildAndFillGraph
def buildGraphArray = ...
def buildAndFillGraph = ...
class PathGraph(underlyingGraph: Array[Array[Node]]) {
def toBuffer(): mutable.Buffer[Node] = underlyingGraph.flatten.toBuffer
def toArray(): Array[Node] = underlyingGraph.flatten

Union-Find (or Disjoint Set) data structure in Scala

I am looking for an existing implementation of a union-find or disjoint set data structure in Scala before I attempt to roll my own as the optimisations look somewhat complicated.
I mean this kind of thing - where the two operations union and find are optimised.
Does anybody know of anything existing? I've obviously tried googling around.
I had written one for myself some time back which I believe performs decently. Unlike other implementations, the find is O(1) and union is O(log(n)). If you have a lot more union operations than find, then this might not be very useful. I hope you find it useful:
package week2
import scala.collection.immutable.HashSet
import scala.collection.immutable.HashMap
* Union Find implementation.
* Find is O(1)
* Union is O(log(n))
* Implementation is using a HashTable. Each wrap has a set which maintains the elements in that wrap.
* When 2 wraps are unioned, both of their sets are merged. O(log(n)) operation
* A HashMap is also maintained to find the Wrap associated with each node. O(log(n)) operation in maintaining it.
* If the input array is null at any index, it is ignored
class UnionFind[T](all: Array[T]) {
private var dataStruc = new HashMap[T, Wrap]
for (a <- all if (a != null))
dataStruc = dataStruc + (a -> new Wrap(a))
var timeU = 0L
var timeF = 0L
* The number of disjoint sets (starts at the number of elements and is decremented by union)
private var size = dataStruc.size
* Unions the set containing a and b
def union(a: T, b: T): Wrap = {
val st = System.currentTimeMillis()
val first: Wrap = dataStruc.get(a).get
val second: Wrap = dataStruc.get(b).get
if (first.contains(b) || second.contains(a))
else {
// below is to merge smaller with bigger rather than other way around
val firstIsBig = (first.set.size > second.set.size)
val ans = if (firstIsBig) {
first.set = first.set ++ second.set
second.set.foreach(a => {
dataStruc = dataStruc - a
dataStruc = dataStruc + (a -> first)
} else {
second.set = second.set ++ first.set
first.set.foreach(a => {
dataStruc = dataStruc - a
dataStruc = dataStruc + (a -> second)
timeU = timeU + (System.currentTimeMillis() - st)
size = size - 1
* true if they are in same set. false if not
def find(a: T, b: T): Boolean = {
val st = System.currentTimeMillis()
val ans = dataStruc.get(a).get.contains(b)
timeF = timeF + (System.currentTimeMillis() - st)
def sizeUnion: Int = size
class Wrap(e: T) {
var set = new HashSet[T]
set = set + e
def add(elem: T) {
set = set + elem
def contains(elem: T): Boolean = set.contains(elem)
Here is a simple, short and somewhat efficient mutable implementation of UnionFind:
import scala.collection.mutable
class UnionFind[T]:
private val map = new mutable.HashMap[T, mutable.HashSet[T]]
private var size = 0
def distinct = size
def addFresh(a: T): Unit =
val set = new mutable.HashSet[T]
set += a
map(a) = set
size += 1
def setEqual(a: T, b: T): Unit =
val ma = map(a)
val mb = map(b)
if !ma.contains(b) then
// redirect the elements of the smaller set to the bigger set
if ma.size > mb.size
ma ++= mb
mb.foreach { x => map(x) = ma }
mb ++= ma
ma.foreach { x => map(x) = mb }
size = size - 1
def isEqual(a: T, b: T): Boolean =
An immutable implementation of UnionFind can be useful when rollback or backtracking or proofs are necessary
A mutable implementation can avoid garbage collection for speedup
One could also consider a persistent data structure -- it works like an immutable implementation, but internally uses some mutable state for speed