"""
Soft Actor-Critic (SAC)
------------------
Actor policy in SAC is stochastic, with off-policy training.
And 'soft' in SAC indicates the trade-off between the entropy and expected return.
The additional consideration of entropy term helps with more explorative policy.
And this implementation contains an automatic update for the entropy factor.
This version of Soft Actor-Critic (SAC) implementation contains 5 networks:
2 Q net, 2 target Q net, 1 policy net.
It uses alpha loss.
Reference
---------
paper: https://arxiv.org/pdf/1812.05905.pdf
Environment
---
Openai Gym Pendulum-v0, continuous action space
https://gym.openai.com/envs/Pendulum-v0/
Prerequisites
--------------
tensorflow >=2.0.0a0
tensorflow-probability 0.6.0
tensorlayer >=2.0.0
&&
pip install box2d box2d-kengz --user
To run
------
python tutorial_SAC.py --train/test
"""
import argparse
import os
import random
import time
import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import tensorlayer as tl
from tensorlayer.layers import Dense
from tensorlayer.models import Model
from car_env_sac_two import CarEnv
Normal = tfp.distributions.Normal
tl.logging.set_verbosity(tl.logging.DEBUG)
# add command-line arguments: --train/--test
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=False)
parser.add_argument('--test', dest='test', action='store_true', default=False)
args = parser.parse_args()
##################### hyper parameters ####################
ENV_ID = 'Pendulum-v0' # environment id
RANDOM_SEED = 2 # random seed
RENDER = True # render while training
# RL training
ALG_NAME = 'SAC'
TRAIN_EPISODES = 500 # total number of episodes for training
TEST_EPISODES = 1 # total number of episodes for testing
MAX_STEPS = 300 # total number of steps for each episode
TEST_MAX_STEPS = 300
EXPLORE_STEPS = 100 # number of steps with random action sampling at the beginning of training
BATCH_SIZE = 256 # update batch size
HIDDEN_DIM = 32 # size of hidden layers for networks
UPDATE_ITR = 3 # repeated updates for single step
SOFT_Q_LR = 3e-4 # q_net learning rate
POLICY_LR = 3e-4 # policy_net learning rate
ALPHA_LR = 3e-4 # alpha learning rate
POLICY_TARGET_UPDATE_INTERVAL = 3 # delayed update for the policy network and target networks
REWARD_SCALE = 1. # value range of reward
REPLAY_BUFFER_SIZE = 5e5 # size of the replay buffer
AUTO_ENTROPY = True # automatically updating variable alpha for entropy
############################### SAC ####################################
class ReplayBuffer:
"""
a ring buffer for storing transitions and sampling for training
:state: (state_dim,)
:action: (action_dim,)
:reward: (,), scalar
:next_state: (state_dim,)
:done: (,), scalar (0 and 1) or bool (True and False)
"""
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = int((self.position + 1) % self.capacity) # as a ring buffer
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))  # stack each element of the batch
        # The * unpacks the batch: f(a, b) <=> batch = (a, b); f(*batch).
        # zip transposes: a = [1, 2], b = [2, 3] => zip(a, b) => [(1, 2), (2, 3)].
        # map applies the function to each element: map(square, [2, 3]) => [4, 9].
        # np.stack((1, 2)) => array([1, 2])
        return state, action, reward, next_state, done
def __len__(self):
return len(self.buffer)
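# A minimal usage sketch of ReplayBuffer (the variable names here are illustrative only):
#     buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
#     buffer.push(state, action, reward, next_state, done)   # one transition per env step
#     if len(buffer) > BATCH_SIZE:
#         s, a, r, s_, d = buffer.sample(BATCH_SIZE)         # stacked np.ndarray batches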
class SoftQNetwork(Model):
""" the network for evaluate values of state-action pairs: Q(s,a) """
def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3):
super(SoftQNetwork, self).__init__()
input_dim = num_inputs + num_actions
w_init = tf.keras.initializers.glorot_normal(
seed=None
) # glorot initialization is better than uniform in practice
# w_init = tf.random_uniform_initializer(-init_w, init_w)
self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1')
self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2')
self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3')
    def forward(self, state_action):
        """ state_action is the concatenation [state, action] along the feature axis """
        x = self.linear1(state_action)
        x = self.linear2(x)
        x = self.linear3(x)
        return x
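# A minimal usage sketch (shapes illustrative): in the SAC updates, the Q nets consume
# the state and action concatenated along the feature axis:
#     q_input = tf.concat([state, action], axis=1)   # shape (batch, state_dim + action_dim)
#     q_value = soft_q_net(q_input)                  # shape (batch, 1)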
class PolicyNetwork(Model):
""" the network for generating non-determinstic (Gaussian distributed) action from the state input """
def __init__(
self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2
):
super(PolicyNetwork, self).__init__()
self.log_std_min = log_std_min
self.log_std_max = log_std_max
w_init = tf.keras.initializers.glorot_normal(seed=None)
# w_init = tf.random_uniform_initializer(-init_w, init_w)
self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1')
self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2')
self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3')
self.mean_linear = Dense(
n_units=num_actions, W_init=w_init, b_init=tf.random_uniform_initializer(-init_w, init_w),
in_channels=hidden_dim, name='policy_mean'
)
self.log_std_linear = Dense(
n_units=num_actions, W_init=w_init, b_init=tf.random_uniform_initializer(-init_w, init_w),
in_channels=hidden_dim, name='policy_logstd'
)
self.action_range = action_range
self.num_actions = num_actions
def forward(self, state):
x = self.linear1(state)
x = self.linear2(x)
x = self.linear3(x)
mean = self.mean_linear(x)
log_std = self.log_std_linear(x)
log_std = tf.clip_by_value(log_std, self.log_std_min, self.log_std_max)
return mean, log_std
def evaluate(self, state, epsilon=1e-6):
""" generate action with state for calculating gradients """
state = state.astype(np.float32)
mean, log_std = self.forward(state)
        std = tf.math.exp(log_std)  # no clipping in evaluation, as clipping would affect gradient flow
normal = Normal(0, 1)
z = normal.sample(mean.shape)
action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick
action = self.action_range * action_0
        # According to the original paper, with an extra last term accounting for the action range scaling:
        log_prob = Normal(mean, std).log_prob(mean + std * z) \
            - tf.math.log(1. - action_0 ** 2 + epsilon) - np.log(self.action_range)
        # Both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
        # Normal.log_prob returns one value per action dimension rather than a single probability,
        # so sum across the action dimension to get a one-dim log-probability (or use MultivariateNormal).
        log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis]  # expand dim, as reduce_sum removes one dim
return action, log_prob, z, mean, log_std
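    # Why the -log(1 - tanh(u)^2) term in evaluate(): for a = tanh(u) with u ~ N(mean, std),
    # the change-of-variables formula gives
    #     log p(a) = log N(u; mean, std) - log|da/du| = log N(u; mean, std) - log(1 - tanh(u)^2),
    # and scaling by action_range subtracts a further log(action_range)
    # (see the bounded-action derivation in the appendix of the reference paper).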
    def get_action(self, state, greedy=False):
        """ generate action with state for interaction with the environment """
        mean, log_std = self.forward([state])
        std = tf.math.exp(log_std)
        z = Normal(0, 1).sample(mean.shape)
        action = self.action_range * tf.math.tanh(mean + std * z)  # reparameterized TanhNormal sample, as in evaluate()
        action = self.action_range * tf.math.tanh(mean) if greedy else action  # greedy: take the distribution mean
        return action.numpy()[0]
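# A minimal sketch of the automatic entropy-temperature update (the "alpha loss" mentioned
# in the module docstring), assuming a tf.Variable log_alpha, a target_entropy constant
# (commonly -action_dim), and an alpha_optimizer; these names are illustrative:
#     with tf.GradientTape() as tape:
#         alpha_loss = -tf.reduce_mean(log_alpha * (log_prob + target_entropy))
#     grads = tape.gradient(alpha_loss, [log_alpha])
#     alpha_optimizer.apply_gradients(zip(grads, [log_alpha]))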