"""
Soft Actor-Critic (SAC)
------------------
Actor policy in SAC is stochastic, with off-policy training.
And 'soft' in SAC indicates the trade-off between the entropy and expected return.
The additional consideration of entropy term helps with more explorative policy.
And this implementation contains an automatic update for the entropy factor.
This version of Soft Actor-Critic (SAC) implementation contains 5 networks:
2 Q net, 2 target Q net, 1 policy net.
It uses alpha loss.
Reference
---------
paper: https://arxiv.org/pdf/1812.05905.pdf
Environment
---
Openai Gym Pendulum-v0, continuous action space
https://gym.openai.com/envs/Pendulum-v0/
Prerequisites
--------------
tensorflow >=2.0.0a0
tensorflow-probability 0.6.0
tensorlayer >=2.0.0
&&
pip install box2d box2d-kengz --user
To run
------
python tutorial_SAC.py --train/test
"""
import argparse
import os
import random
import time
import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import tensorlayer as tl
from tensorlayer.layers import Dense
from tensorlayer.models import Model
from car_env_sac_two import CarEnv
Normal = tfp.distributions.Normal
tl.logging.set_verbosity(tl.logging.DEBUG)
# add command-line arguments: --train/--test
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=False)
parser.add_argument('--test', dest='test', action='store_true', default=False)
args = parser.parse_args()
##################### hyper parameters ####################
ENV_ID = 'Pendulum-v0' # environment id
RANDOM_SEED = 2 # random seed
RENDER = True # render while training
# RL training
ALG_NAME = 'SAC'
TRAIN_EPISODES = 500 # total number of episodes for training
TEST_EPISODES = 1 # total number of episodes for testing
MAX_STEPS = 300 # total number of steps for each episode
TEST_MAX_STEPS = 300
EXPLORE_STEPS = 100 # number of steps with random action sampling at the beginning of training
BATCH_SIZE = 256 # update batch size
HIDDEN_DIM = 32 # size of hidden layers for networks
UPDATE_ITR = 3 # repeated updates for single step
SOFT_Q_LR = 3e-4 # q_net learning rate
POLICY_LR = 3e-4 # policy_net learning rate
ALPHA_LR = 3e-4 # alpha learning rate
POLICY_TARGET_UPDATE_INTERVAL = 3 # delayed update for the policy network and target networks
REWARD_SCALE = 1. # value range of reward
REPLAY_BUFFER_SIZE = 5e5 # size of the replay buffer
AUTO_ENTROPY = True # automatically updating variable alpha for entropy
############################### SAC ####################################
class ReplayBuffer:
"""
a ring buffer for storing transitions and sampling for training
:state: (state_dim,)
:action: (action_dim,)
:reward: (,), scalar
:next_state: (state_dim,)
:done: (,), scalar (0 and 1) or bool (True and False)
"""
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = int((self.position + 1) % self.capacity) # as a ring buffer
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))  # stack each element of the batch
        # The * unpacks the batch: f(a, b) <=> batch = (a, b); f(*batch).
        # zip transposes: a = [1, 2], b = [2, 3] => zip(a, b) => [(1, 2), (2, 3)].
        # map applies the function to each element: map(square, [2, 3]) => [4, 9].
        # np.stack((1, 2)) => array([1, 2])
        return state, action, reward, next_state, done
def __len__(self):
return len(self.buffer)
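# A minimal usage sketch of ReplayBuffer (the variable names here are illustrative only):
#     buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
#     buffer.push(state, action, reward, next_state, done)   # one transition per env step
#     if len(buffer) > BATCH_SIZE:
#         s, a, r, s_, d = buffer.sample(BATCH_SIZE)         # stacked np.ndarray batches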
class SoftQNetwork(Model):
""" the network for evaluate values of state-action pairs: Q(s,a) """
def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3):
super(SoftQNetwork, self).__init__()
input_dim = num_inputs + num_actions
w_init = tf.keras.initializers.glorot_normal(
seed=None
) # glorot initialization is better than uniform in practice
# w_init = tf.random_uniform_initializer(-init_w, init_w)
self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1')
self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2')
self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3')
    def forward(self, state_action):
        """ state_action is the concatenation [state, action] along the feature axis """
        x = self.linear1(state_action)
        x = self.linear2(x)
        x = self.linear3(x)
        return x
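# A minimal usage sketch (shapes illustrative): in the SAC updates, the Q nets consume
# the state and action concatenated along the feature axis:
#     q_input = tf.concat([state, action], axis=1)   # shape (batch, state_dim + action_dim)
#     q_value = soft_q_net(q_input)                  # shape (batch, 1)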
class PolicyNetwork(Model):
""" the network for generating non-determinstic (Gaussian distributed) action from the state input """
def __init__(
self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2
):
super(PolicyNetwork, self).__init__()
self.log_std_min = log_std_min
self.log_std_max = log_std_max
w_init = tf.keras.initializers.glorot_normal(seed=None)
# w_init = tf.random_uniform_initializer(-init_w, init_w)
self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1')
self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2')
self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3')
self.mean_linear = Dense(
n_units=num_actions, W_init=w_init, b_init=tf.random_uniform_initializer(-init_w, init_w),
in_channels=hidden_dim, name='policy_mean'
)
self.log_std_linear = Dense(
n_units=num_actions, W_init=w_init, b_init=tf.random_uniform_initializer(-init_w, init_w),
in_channels=hidden_dim, name='policy_logstd'
)
self.action_range = action_range
self.num_actions = num_actions
def forward(self, state):
x = self.linear1(state)
x = self.linear2(x)
x = self.linear3(x)
mean = self.mean_linear(x)
log_std = self.log_std_linear(x)
log_std = tf.clip_by_value(log_std, self.log_std_min, self.log_std_max)
return mean, log_std
def evaluate(self, state, epsilon=1e-6):
""" generate action with state for calculating gradients """
state = state.astype(np.float32)
mean, log_std = self.forward(state)
        std = tf.math.exp(log_std)  # no clipping in evaluation, as clipping would affect gradient flow
normal = Normal(0, 1)
z = normal.sample(mean.shape)
action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick
action = self.action_range * action_0
        # According to the original paper, with an extra last term accounting for the action range scaling:
        log_prob = Normal(mean, std).log_prob(mean + std * z) \
            - tf.math.log(1. - action_0 ** 2 + epsilon) - np.log(self.action_range)
        # Both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
        # Normal.log_prob returns one value per action dimension rather than a single probability,
        # so sum across the action dimension to get a one-dim log-probability (or use MultivariateNormal).
        log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis]  # expand dim, as reduce_sum removes one dim
return action, log_prob, z, mean, log_std
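    # Why the -log(1 - tanh(u)^2) term in evaluate(): for a = tanh(u) with u ~ N(mean, std),
    # the change-of-variables formula gives
    #     log p(a) = log N(u; mean, std) - log|da/du| = log N(u; mean, std) - log(1 - tanh(u)^2),
    # and scaling by action_range subtracts a further log(action_range)
    # (see the bounded-action derivation in the appendix of the reference paper).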
    def get_action(self, state, greedy=False):
        """ generate action with state for interaction with the environment """
        mean, log_std = self.forward([state])
        std = tf.math.exp(log_std)
        z = Normal(0, 1).sample(mean.shape)
        action = self.action_range * tf.math.tanh(mean + std * z)  # reparameterized TanhNormal sample, as in evaluate()
        action = self.action_range * tf.math.tanh(mean) if greedy else action  # greedy: take the distribution mean
        return action.numpy()[0]
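# A minimal sketch of the automatic entropy-temperature update (the "alpha loss" mentioned
# in the module docstring), assuming a tf.Variable log_alpha, a target_entropy constant
# (commonly -action_dim), and an alpha_optimizer; these names are illustrative:
#     with tf.GradientTape() as tape:
#         alpha_loss = -tf.reduce_mean(log_alpha * (log_prob + target_entropy))
#     grads = tape.gradient(alpha_loss, [log_alpha])
#     alpha_optimizer.apply_gradients(zip(grads, [log_alpha]))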