Skip to content

swarmrl.value_functions.generalized_advantage_estimate Module API Reference

Module for the expected returns value function.

GAE

Class for the expected returns.

Source code in swarmrl/value_functions/generalized_advantage_estimate.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class GAE:
    """
    Generalized advantage estimation of the expected returns.
    """

    def __init__(self, gamma: float = 0.99, lambda_: float = 0.95):
        """
        Constructor for the generalized advantage estimate class.

        Parameters
        ----------
        gamma : float
                A decay factor for the values of the task each time step.
        lambda_ : float
                A decay factor that describes the amount of bias included in
                the advantage calculation.

        Notes
        -----
        See https://arxiv.org/pdf/1506.02438.pdf for more information.
        """
        self.gamma = gamma
        self.lambda_ = lambda_

        # Small constant used to keep the normalization division stable.
        self.eps = np.finfo(np.float32).eps.item()

    @partial(jit, static_argnums=(0,))
    def __call__(self, rewards: np.ndarray, values: np.ndarray):
        """
        Compute advantages and expected returns for one episode.

        Parameters
        ----------
        rewards : np.ndarray (n_time_steps, n_particles)
                Rewards collected over the episode.
        values : np.ndarray (n_time_steps, n_particles)
                Critic value predictions for the episode.

        Returns
        -------
        advantages : np.ndarray (n_time_steps, n_particles)
                Advantages normalized to zero mean and unit variance.
        returns : np.ndarray (n_time_steps, n_particles)
                Expected returns, i.e. the raw advantages plus the values.
        """
        final_step = rewards.shape[0] - 1
        running_gae = 0.0
        advantages = np.zeros_like(rewards)

        # Walk backwards through time accumulating discounted TD errors.
        for step in range(final_step, -1, -1):
            # No successor value to bootstrap from at the final step.
            bootstrap = values[step + 1] if step != final_step else 0.0
            td_error = rewards[step] + self.gamma * bootstrap - values[step]
            running_gae = td_error + self.gamma * self.lambda_ * running_gae
            advantages = advantages.at[step].set(running_gae)

        # Returns are built from the un-normalized advantages.
        returns = advantages + values

        # Normalize the advantages; eps guards against a zero denominator.
        advantages = (advantages - np.mean(advantages)) / (
            np.std(advantages) + self.eps
        )
        return advantages, returns

__call__(rewards, values)

Call function for the advantage.

Parameters

rewards : np.ndarray (n_time_steps, n_particles)
    A numpy array of rewards to use in the calculation.
values : np.ndarray (n_time_steps, n_particles)
    The prediction of the critic for the episode.

Returns

advantages : np.ndarray (n_time_steps, n_particles)
    Normalized advantages for the policy update.
returns : np.ndarray (n_time_steps, n_particles)
    Expected returns for the rewards.

Source code in swarmrl/value_functions/generalized_advantage_estimate.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
@partial(jit, static_argnums=(0,))
def __call__(self, rewards: np.ndarray, values: np.ndarray):
    """
    Compute generalized advantage estimates and expected returns.

    Parameters
    ----------
    rewards : np.ndarray (n_time_steps, n_particles)
            A numpy array of rewards to use in the calculation.
    values : np.ndarray (n_time_steps, n_particles)
            The prediction of the critic for the episode.

    Returns
    -------
    advantages : np.ndarray (n_time_steps, n_particles)
            Advantages normalized to zero mean and unit variance.
    returns : np.ndarray (n_time_steps, n_particles)
            Expected returns (un-normalized advantages plus values).
    """
    # Accumulate the GAE recursion backwards through time.
    gae = 0
    advantages = np.zeros_like(rewards)
    for t in reversed(range(len(rewards))):
        if t != len(rewards) - 1:
            # TD error bootstrapped from the next step's value estimate.
            delta = rewards[t] + self.gamma * values[t + 1] - values[t]
        else:
            # Final step: no successor value to bootstrap from.
            delta = rewards[t] - values[t]

        gae = delta + self.gamma * self.lambda_ * gae
        advantages = advantages.at[t].set(gae)

    # Returns are built from the raw (un-normalized) advantages.
    returns = advantages + values

    # Normalize advantages; eps keeps the division stable when std is 0.
    advantages = (advantages - np.mean(advantages)) / (
        np.std(advantages) + self.eps
    )
    return advantages, returns

__init__(gamma=0.99, lambda_=0.95)

Constructor for the generalized advantage estimate class

Parameters

gamma : float
    A decay factor for the values of the task each time step.
lambda_ : float
    A decay factor that describes the amount of bias included in the
    advantage calculation.

Notes

See https://arxiv.org/pdf/1506.02438.pdf for more information.

Source code in swarmrl/value_functions/generalized_advantage_estimate.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def __init__(self, gamma: float = 0.99, lambda_: float = 0.95):
    """
    Constructor for the generalized advantage estimate class.

    Parameters
    ----------
    gamma : float
            A decay factor for the values of the task each time step.
    lambda_ : float
            A decay factor that describes the amount of bias included in the
            advantage calculation.

    Notes
    -----
    See https://arxiv.org/pdf/1506.02438.pdf for more information.
    """
    self.gamma = gamma
    self.lambda_ = lambda_

    # Set by us to stabilize division operations.
    self.eps = np.finfo(np.float32).eps.item()