I read this post about custom activation functions, but I still can't get my code to work. My activation function can be expressed as a combination of existing PyTorch functions, and it works fine as function_pytorch(prediction, Q_sample). [Q_samples is a variable I need, and it doesn't need a gradient.]
My activation function should receive the output of the NN, apply function_pytorch, and its output then goes into the loss function. So:
class Activation_fun(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input, Q_samples):
        return function_pytorch(input, Q_samples)
In my NN I have:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNet, self).__init__()
        self.BN0 = nn.BatchNorm1d(input_size)
        self.l1 = nn.Linear(input_size, hidden_size)
        self.tan = nn.Tanh()
        self.BN = nn.BatchNorm1d(output_size)
        # custom activation
        self.l2 = Activation_fun()

    def forward(self, x, q):
        out = self.BN0(x)
        out = self.l1(out)
        out = self.tan(out)
        out = self.BN(out)
        out = self.l2(out, q)
        return out

model = NeuralNet(input_size, hidden_size, output_size)
And in my training epochs:
outputs = model(inputs, q_samples)
The problem is that my predictions remain fixed when I apply my custom activation function.
Is there any problem in my implementation?
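For context, function_pytorch itself is not shown here; any composition of existing differentiable PyTorch ops fits the pattern, for example this hypothetical stand-in in which Q_samples is detached so it receives no gradient:

import torch

# Hypothetical stand-in for function_pytorch -- any mix of built-in ops works,
# and detaching Q_samples keeps it out of the gradient computation.
def function_pytorch(prediction, Q_samples):
    q = Q_samples.detach()            # Q_samples needs no gradient
    return torch.sigmoid(prediction) * q + torch.tanh(prediction)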
I am using a vanilla Transformer architecture from the "Attention Is All You Need" paper for a sequence-to-sequence task, as shown in the following code.
Assuming that I would like to use the torch.nn.init.kaiming_uniform_ initialization method, how would one go about initializing the weights of the nn.Transformer?
Is it necessary to use a custom encoder and decoder class in order for that to happen?
import math

import torch
import torch.nn as nn
from torch import Tensor


# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 20):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])


# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        # Initialize weights with He initialization
        self.embedding.weight = nn.init.kaiming_uniform_(self.embedding.weight)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)
# Seq2Seq Network
class Transformer(nn.Module):
    def __init__(self,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 num_encoder_layers: int = 1,
                 num_decoder_layers: int = 1,
                 emb_size: int = 300,
                 nhead: int = 3,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1,
                 activation_function='relu'):
        super().__init__()
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)
        self.transformer = torch.nn.Transformer(d_model=emb_size,
                                                nhead=nhead,
                                                num_encoder_layers=num_encoder_layers,
                                                num_decoder_layers=num_decoder_layers,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout,
                                                batch_first=True,
                                                activation=activation_function)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

    def init_weights(self):
        self.generator.weight = nn.init.kaiming_uniform_(self.generator.weight)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        # the .permute() is necessary since the positional encoder expects tensors of shape
        # (seq_len, batch_size, emb_length)
        src_emb = self.positional_encoding(self.src_tok_emb(src).permute(1, 0, 2)).permute(1, 0, 2)
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg).permute(1, 0, 2)).permute(1, 0, 2)
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
            self.tgt_tok_emb(tgt)), memory,
            tgt_mask)
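One way this could be done without custom encoder or decoder classes is to loop over the parameters of the built-in nn.Transformer after construction. The sketch below only touches parameters with 2 or more dimensions, since kaiming_uniform_ is not defined for the 1-D biases and LayerNorm weights; the chosen sizes are purely illustrative:

def kaiming_init_transformer(transformer: nn.Transformer) -> None:
    # Re-initialize every weight matrix of the stock nn.Transformer in place;
    # 1-D parameters (biases, LayerNorm weights) keep their defaults.
    for name, param in transformer.named_parameters():
        if param.dim() > 1:
            nn.init.kaiming_uniform_(param)

model = Transformer(src_vocab_size=100, tgt_vocab_size=100)  # illustrative vocab sizes
kaiming_init_transformer(model.transformer)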
I use LSTM + attention. The model does not learn my 3 classes; it gives me only one class all the time.
import torch
import torch.nn as nn
import torch.nn.functional as F

# Class header restored for completeness; input_dim, hidden_dim, num_layers,
# bidirectional, dropout and num_classes are hyperparameters defined elsewhere.
class LSTMAttentionClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_dim,
                            hidden_dim,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            dropout=dropout,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def attention_net(self, lstm_output, final_state):
        hidden = final_state.unsqueeze(2)
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, 1)
        context = torch.bmm(lstm_output.transpose(1, 2),
                            soft_attn_weights.unsqueeze(2)).squeeze(2)
        return context, soft_attn_weights.cpu().data.numpy()

    def forward(self, text):
        output, (hn, cn) = self.lstm(text)
        hn = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)
        attn_output, attention = self.attention_net(output, hn)
        return self.fc(attn_output), attention
I would like to have a bunch of generators in my config dict. So I tried this:
@yaml.register_class
class UniformDistribution:
    yaml_tag = '!uniform'

    @classmethod
    def from_yaml(cls, a, node):
        for x in node.value:
            if x[0].value == 'min':
                min_ = float(x[1].value)
            if x[0].value == 'max':
                max_ = float(x[1].value)

        def f():
            while True:
                yield np.random.uniform(min_, max_)

        g = f()
        return g
However, the parser never returns, because generators are also used internally to resolve references like &A and *A. Returning something like (g,) is a fairly simple workaround, but I would prefer a solution where I don't need the additional and very confusing index-0 term in next(config['position_generator'][0]).
Any ideas?
This wrapper, adapted from a different question, did exactly what I was looking for:
from collections.abc import Generator

import numpy as np
from ruamel.yaml import YAML   # assuming ruamel.yaml, which provides register_class

yaml = YAML()


class GeneratorWrapper(Generator):
    def __init__(self, function, *args):
        self.function = function
        self.args = args

    def send(self, ignored_arg):
        return self.function(*self.args)

    def throw(self, typ=None, val=None, tb=None):
        raise StopIteration


@yaml.register_class
class UniformDistribution:
    yaml_tag = '!uniform'

    @classmethod
    def from_yaml(cls, constructor, node):
        for x in node.value:
            value = float(x[1].value)
            if x[0].value == 'min':
                min_ = value
            if x[0].value == 'max':
                max_ = value
        return GeneratorWrapper(np.random.uniform, min_, max_)
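For illustration, assuming a config file config.yaml with an entry such as position_generator: !uniform {min: 0.0, max: 1.0}, the wrapped entry can then be consumed directly with next(), without the extra [0] index:

with open('config.yaml') as f:                 # file name is just a placeholder
    config = yaml.load(f)

sample = next(config['position_generator'])    # draws a fresh uniform value on each call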
I am running the multiple linear regression example for Flink (0.10-SNAPSHOT). I can't figure out how to extract the weights (e.g. slope and intercept, beta0 and beta1, whatever you want to call them). I'm not super seasoned in Scala, which is probably half my problem.
Thanks for any help anyone can give.
object Job {
  def main(args: Array[String]) {
    // set up the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment

    val survival = env.readCsvFile[(String, String, String, String)]("/home/danger/IdeaProjects/quickstart/docs/haberman.data")

    val survivalLV = survival
      .map { tuple =>
        val list = tuple.productIterator.toList
        val numList = list.map(_.asInstanceOf[String].toDouble)
        LabeledVector(numList(3), DenseVector(numList.take(3).toArray))
      }

    val mlr = MultipleLinearRegression()
      .setStepsize(1.0)
      .setIterations(100)
      .setConvergenceThreshold(0.001)

    mlr.fit(survivalLV)
    println(mlr.toString())       // This doesn't do anything productive...
    println(mlr.weightsOption)    // Neither does this.
  }
}
The problem is that you've only constructed the Flink job (DAG) which will calculate the weights, but it has not yet been executed. The easiest way to trigger the execution is to use the collect method, which retrieves the result of the DataSet back to your client.
mlr.fit(survivalLV)

val weights = mlr.weightsOption match {
  case Some(weights) => weights.collect()
  case None => throw new Exception("Could not calculate the weights.")
}

println(weights)
I am working with Apache Spark to build an LRM using the LogisticRegressionWithLBFGS() class provided by MLlib. Once the model is built, we can use the provided predict function, which gives only the binary labels as output. I also want the probabilities to be calculated.
There is a relevant implementation in
https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
override protected def predictPoint(
    dataMatrix: Vector,
    weightMatrix: Vector,
    intercept: Double) = {
  require(dataMatrix.size == numFeatures)

  // If dataMatrix and weightMatrix have the same dimension, it's binary logistic regression.
  if (numClasses == 2) {
    val margin = dot(weightMatrix, dataMatrix) + intercept
    val score = 1.0 / (1.0 + math.exp(-margin))
    threshold match {
      case Some(t) => if (score > t) 1.0 else 0.0
      case None => score
    }
  }
This method is not exposed, and the probabilities are not available either. How can I use this function to get the probabilities?
The dot method used in the above function is also not exposed; it is present in the BLAS package, but it is not public.
Call myModel.clearThreshold to get the raw prediction instead of the 0/1 labels.
Mind this only works for Binary Logistic Regression (numClasses == 2).
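For example, sketched in PySpark (the Scala calls are analogous; training_rdd and test_rdd are placeholder RDDs of LabeledPoint):

from pyspark.mllib.classification import LogisticRegressionWithLBFGS

model = LogisticRegressionWithLBFGS.train(training_rdd)    # training_rdd is assumed
model.clearThreshold()                                     # predict() now returns probabilities
probabilities = test_rdd.map(lambda p: model.predict(p.features))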
I encountered a similar problem in trying to obtain the raw predictions for a multiclass problem. For me, the best solution was to create a method by borrowing from and customizing the Spark MLlib logistic regression source. You can create one like so:
object ClassificationUtility {
  def predictPoint(dataMatrix: Vector, model: LogisticRegressionModel):
      (Double, Array[Double]) = {
    require(dataMatrix.size == model.numFeatures)
    val dataWithBiasSize: Int = model.weights.size / (model.numClasses - 1)
    val weightsArray: Array[Double] = model.weights match {
      case dv: DenseVector => dv.values
      case _ =>
        throw new IllegalArgumentException(
          s"weights only supports dense vector but got type ${model.weights.getClass}.")
    }
    var bestClass = 0
    var maxMargin = 0.0
    val withBias = dataMatrix.size + 1 == dataWithBiasSize
    val classProbabilities: Array[Double] = new Array[Double](model.numClasses)
    (0 until model.numClasses - 1).foreach { i =>
      var margin = 0.0
      dataMatrix.foreachActive { (index, value) =>
        if (value != 0.0) margin += value * weightsArray((i * dataWithBiasSize) + index)
      }
      // Intercept is required to be added into margin.
      if (withBias) {
        margin += weightsArray((i * dataWithBiasSize) + dataMatrix.size)
      }
      if (margin > maxMargin) {
        maxMargin = margin
        bestClass = i + 1
      }
      classProbabilities(i + 1) = 1.0 / (1.0 + Math.exp(-(margin - maxMargin)))
    }
    return (bestClass.toDouble, classProbabilities)
  }
}
Note it is only slightly different from the original method: it just calculates the logistic as a function of the input features. It also defines some vals and vars that are originally private or declared outside of this method. Ultimately, it indexes the scores in an array and returns it along with the best answer. I call my method like so:
// Compute raw scores on the test set.
val predictionAndLabelsAndProbabilities = test
  .map { case LabeledPoint(label, features) =>
    val (prediction, probabilities) = ClassificationUtility
      .predictPoint(features, model)
    (prediction, label, probabilities)
  }
However:
It seems the Spark contributors are discouraging the use of MLlib in favor of ML. The ML logistic regression API currently does not support multiclass classification. I am now using OneVsRest, which acts as a wrapper for one-vs-all classification. I am working on a similar customization to get the raw scores.
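A rough sketch of that OneVsRest setup, written here with the PySpark ML API (train_df and test_df are placeholder DataFrames with "features" and "label" columns):

from pyspark.ml.classification import LogisticRegression, OneVsRest

ovr = OneVsRest(classifier=LogisticRegression(maxIter=100))
ovr_model = ovr.fit(train_df)                  # train_df is assumed
predictions = ovr_model.transform(test_df)     # adds a "prediction" column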
I believe the call is myModel.clearThreshold(); i.e. myModel.clearThreshold without the parentheses fails. See the linear SVM example here.