Source code for nnabla_rl.distributions.softmax

# Copyright 2020,2021 Sony Corporation.
# Copyright 2021,2022,2023 Sony Group Corporation.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
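# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.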
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

import nnabla as nn
import nnabla.functions as NF
import nnabla_rl.functions as RF
from nnabla_rl.distributions import DiscreteDistribution

class Softmax(DiscreteDistribution):
    """Softmax distribution which samples a class index :math:`i` according to
    the following probability.

    :math:`i \\sim \\frac{\\exp{z_{i}}}{\\sum_{j}\\exp{z_{j}}}`.

    Args:
        z (nn.Variable): logits :math:`z`. The size of the last axis is used
            as the number of classes to sample from. An input that is not an
            nn.Variable is converted with ``nn.Variable.from_numpy_array``.
    """

    def __init__(self, z):
        super(Softmax, self).__init__()
        if not isinstance(z, nn.Variable):
            z = nn.Variable.from_numpy_array(z)

        # Probabilities and log-probabilities are both taken over the last axis.
        last_axis = len(z.shape) - 1
        self._distribution = NF.softmax(x=z, axis=last_axis)
        self._log_distribution = NF.log_softmax(x=z, axis=last_axis)
        self._batch_size = z.shape[0]
        self._num_class = z.shape[-1]

        # Class indices 0 .. num_class - 1 as int32.
        labels = np.arange(self._num_class, dtype=np.int32)
        self._labels = nn.Variable.from_numpy_array(labels)

        # Replicate the label vector once per entry of every leading axis of z
        # (innermost leading axis first) so that random_choice receives a
        # population of candidate actions alongside the weights.
        self._actions = self._labels
        for size in reversed(z.shape[0:-1]):
            self._actions = NF.stack(*([self._actions] * size))

    @property
    def ndim(self):
        """Always 1 for this distribution."""
        return 1

    def sample(self, noise_clip=None):
        """Sample class indices using the softmax probabilities as weights.

        Args:
            noise_clip: not used by this distribution.

        Returns:
            The output of ``NF.random_choice`` over the stored actions.
        """
        # NOTE: nnabla's random_choice backpropagates through distribution
        return NF.random_choice(self._actions, w=self._distribution)

    def sample_multiple(self, num_samples, noise_clip=None):
        """Not supported by this distribution.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def sample_and_compute_log_prob(self, noise_clip=None):
        """Sample class indices and compute the log probability of the sample.

        Args:
            noise_clip: not used by this distribution.

        Returns:
            A tuple ``(sample, log_prob)`` where ``log_prob`` is
            ``self.log_prob(sample)``.
        """
        # NOTE: nnabla's random_choice backpropagates through distribution
        sample = NF.random_choice(self._actions, w=self._distribution)
        log_prob = self.log_prob(sample)
        return sample, log_prob

    def choose_probable(self):
        """Return the argmax of the probabilities along the last axis."""
        # NOTE: nnabla's argmax backpropagates through distribution
        last_axis = len(self._distribution.shape) - 1
        return RF.argmax(self._distribution, axis=last_axis)

    def mean(self):
        """Not supported by this distribution.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def log_prob(self, x):
        """Compute the log probability of the given class indices.

        Args:
            x: class indices to evaluate.
                NOTE(review): presumably an integer nn.Variable whose trailing
                axis has size 1, as required by ``NF.one_hot`` -- confirm
                against callers.

        Returns:
            Sum over the last axis (kept as a size-1 axis) of the log
            probabilities masked by the one-hot encoding of ``x``.
        """
        one_hot_action = NF.one_hot(x, shape=(self._num_class, ))
        last_axis = len(self._distribution.shape) - 1
        return NF.sum(self._log_distribution * one_hot_action, axis=last_axis, keepdims=True)

    def entropy(self):
        """Compute the entropy :math:`-\\sum_{i} p_{i} \\log p_{i}`.

        Returns:
            The negated sum of ``p * log p`` over the last axis, which is kept
            as a size-1 axis.
        """
        plogp = self._distribution * self._log_distribution
        return -NF.sum(plogp, axis=len(plogp.shape) - 1, keepdims=True)

    def kl_divergence(self, q):
        """Compute the KL divergence from this distribution to ``q``.

        Evaluates :math:`\\sum_{i} p_{i} (\\log p_{i} - \\log q_{i})` over the
        last axis, where :math:`p` is this distribution.

        Args:
            q (Softmax): the distribution to compare against.

        Returns:
            The summed divergence with the last axis kept as a size-1 axis.

        Raises:
            ValueError: if ``q`` is not a Softmax instance.
        """
        if not isinstance(q, Softmax):
            raise ValueError("Invalid q to compute kl divergence")
        log_ratio = self._log_distribution - q._log_distribution
        last_axis = len(self._distribution.shape) - 1
        return NF.sum(self._distribution * log_ratio, axis=last_axis, keepdims=True)