基于强化学习A3C与DQN的月球着陆游戏训练设计与实现

共3个文件

py：3个

版权申诉

强化学习

137 浏览量 2022-04-16 22:28:21 上传评论收藏 8KB RAR 举报

资源推荐

资源详情

资源评论

收起资源包目录

基于强化学习A3C与DQN的月球着陆游戏训练设计与实现.rar （3个子文件）

基于强化学习A3C与DQN的月球着陆游戏训练设计与实现

run_LunarLander.py 2KB

DuelingDQNPrioritizedReplay.py 13KB

A3C.py 9KB

""" The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952) View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ Using: Tensorflow: 1.0 """ import numpy as np import tensorflow as tf np.random.seed(1) tf.set_random_seed(1) class SumTree(object): """ This SumTree code is modified version and the original code is from: https://github.com/jaara/AI-blog/blob/master/SumTree.py Story the data with it priority in tree and data frameworks. """ data_pointer = 0 def __init__(self, capacity): self.capacity = capacity # for all priority values self.tree = np.zeros(2 * capacity - 1) # [--------------Parent nodes-------------][-------leaves to recode priority-------] # size: capacity - 1 size: capacity self.data = np.zeros(capacity, dtype=object) # for all transitions # [--------------data frame-------------] # size: capacity def add_new_priority(self, p, data): leaf_idx = self.data_pointer + self.capacity - 1 self.data[self.data_pointer] = data # update data_frame self.update(leaf_idx, p) # update tree_frame self.data_pointer += 1 if self.data_pointer >= self.capacity: # replace when exceed the capacity self.data_pointer = 0 def update(self, tree_idx, p): change = p - self.tree[tree_idx] self.tree[tree_idx] = p self._propagate_change(tree_idx, change) def _propagate_change(self, tree_idx, change): """change the sum of priority value in all parent nodes""" parent_idx = (tree_idx - 1) // 2 self.tree[parent_idx] += change if parent_idx != 0: self._propagate_change(parent_idx, change) def get_leaf(self, lower_bound): leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound data_idx = leaf_idx - self.capacity + 1 return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]] def _retrieve(self, lower_bound, parent_idx=0): """ Tree structure and array storage: Tree index: 0 -> storing priority sum / \ 1 2 / \ / \ 3 4 5 6 -> storing priority for transitions Array type for storing: [0,1,2,3,4,5,6] """ left_child_idx = 2 * parent_idx + 1 right_child_idx = left_child_idx + 1 if left_child_idx >= len(self.tree): # end search when no more child return parent_idx if self.tree[left_child_idx] == self.tree[right_child_idx]: return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx])) if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node return self._retrieve(lower_bound, left_child_idx) else: return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx) @property def root_priority(self): return self.tree[0] # the root class Memory(object): # stored as ( s, a, r, s_ ) in SumTree """ This SumTree code is modified version and the original code is from: https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py """ epsilon = 0.001 # small amount to avoid zero priority alpha = 0.6 # [0~1] convert the importance of TD error to priority beta = 0.4 # importance-sampling, from initial value increasing to 1 beta_increment_per_sampling = 1e-4 # annealing the bias abs_err_upper = 1 # for stability refer to paper def __init__(self, capacity): self.tree = SumTree(capacity) def store(self, error, transition): p = self._get_priority(error) self.tree.add_new_priority(p, transition) def sample(self, n): batch_idx, batch_memory, ISWeights = [], [], [] segment = self.tree.root_priority / n self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1 min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights for i in range(n): a = segment * i b = segment * (i + 1) lower_bound = np.random.uniform(a, b) idx, p, data = self.tree.get_leaf(lower_bound) prob = p / self.tree.root_priority ISWeights.append(self.tree.capacity * prob) batch_idx.append(idx) batch_memory.append(data) ISWeights = np.vstack(ISWeights) ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize return batch_idx, np.vstack(batch_memory), ISWeights def update(self, idx, error): p = self._get_priority(error) self.tree.update(idx, p) def _get_priority(self, error): error += self.epsilon # avoid 0 clipped_error = np.clip(error, 0, self.abs_err_upper) return np.power(clipped_error, self.alpha) class DuelingDQNPrioritizedReplay: def __init__( self, n_actions, n_features, learning_rate=0.005, reward_decay=0.9, e_greedy=0.9, replace_target_iter=500, memory_size=10000, batch_size=32, e_greedy_increment=None, hidden=[100, 50], output_graph=False, sess=None, ): self.n_actions = n_actions self.n_features = n_features self.lr = learning_rate self.gamma = reward_decay self.epsilon_max = e_greedy self.replace_target_iter = replace_target_iter self.memory_size = memory_size self.batch_size = batch_size self.hidden = hidden self.epsilon_increment = e_greedy_increment self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max self.learn_step_counter = 0 self._build_net() self.memory = Memory(capacity=memory_size) if sess is None: self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) else: self.sess = sess if output_graph: tf.summary.FileWriter("logs/", self.sess.graph) self.cost_his = [] def _build_net(self): def build_layers(s, c_names, w_initializer, b_initializer): for i, h in enumerate(self.hidden): if i == 0: in_units, out_units, inputs = self.n_features, self.hidden[i], s else: in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l with tf.variable_scope('l%i' % i): w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names) b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names) l = tf.nn.relu(tf.matmul(inputs, w) + b) with tf.variable_scope('Value'): w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names) b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names) self.V = tf.matmul(l, w) + b with tf.variable_scope('Advantage'): w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names) b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names) self.A = tf.matmul(l, w) + b with tf.variable_scope('Q'): out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) # Q = V(s) + A(s,a) # with tf.variable_scope('out')

评论收藏

内容反馈

版权申诉