莫烦老师走迷宫智能强化学习DQN算法实现，开箱即用

共9个文件

py：5个

checkpoint：1个

data-00000-of-00001：1个

强化学习

需积分: 12 122 浏览量 2022-12-30 15:48:48 上传评论 2 收藏 19KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

DQN.zip （9个子文件）

DQN

20221229 2 走迷宫 DQN.py 3KB

DeepQNetwork.py 12KB

maze_env.py 4KB

model

model.ckpt.data-00000-of-00001 1KB

checkpoint 77B

model.ckpt.meta 47KB

model.ckpt.index 679B

logs

20221229 1 走迷宫 DQN.py 3KB

maze_env3.py 6KB

import numpy as np
# NOTE: the NumPy version must be compatible with TensorFlow; the original
# author pins 1.21 (pip install numpy==1.21).
# Import the TensorFlow 1.x compatibility API to avoid version problems.
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

np.random.seed(1)  # fix the NumPy random seed
tf.set_random_seed(1)  # fix the TensorFlow graph-level random seed

# The problem with Q-learning: the size of the Q-table equals
# (number of states) * (number of actions). When there are many states and
# actions the Q-table becomes very large, and both lookup and storage cost a
# lot of time and space.
#
# Q-learning essentially builds a mapping between (state + action) and (the
# maximum reward obtainable by taking that action in that state). Seen
# abstractly this is a mapping from input parameters to output parameters,
# which is exactly what neural networks are good at. It is therefore natural
# to ask whether a neural network can replace the computation of the Q
# function. Going further: once the Q-table is computed we still have to pick
# the best action for the current state from it, so can we learn an end-to-end
# mapping from the current state to the best action directly? That approach is
# called Deep Q Network (DQN).
#
# How DQN differs from Q-learning:
# * A neural network replaces the Q-table for computing Q values, which
#   solves the dimensionality explosion.
# * A replay memory stores past experience. It has two properties:
#   1. Random sampling removes the correlation between samples. Without
#      random sampling the data we draw may be consecutive and correlated, so
#      a training batch could consist only of data from one state and
#      training would fail. Random sampling breaks that correlation and keeps
#      the distribution of training samples even.
#   2. Rolling update: the newest memory replaces the oldest one. The memory
#      is finite and cannot store experience without bound, so it has to be
#      refreshed, and normally newer memories are more valuable than older
#      ones (this is not absolute -- old memories may also hold important
#      experience; that belongs to later DQN refinements).
# * The q_target parameters are frozen temporarily so that the target stays
#   fixed, which helps the neural network converge.


class DeepQNetwork:
    """Deep Q Network agent: an evaluation network plus a target network."""

    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
    ):
        """Store the hyper-parameters, build the graph and open a session.

        Args:
            n_actions: number of actions (width of the Q-value output).
            n_features: number of features (width of the state input).
            learning_rate: learning rate passed to the RMSProp optimizer.
            reward_decay: reward discount factor, stored as ``self.gamma``.
            e_greedy: greediness, stored as ``self.epsilon_max``.
            replace_target_iter: number of iterations between updates of the
                target network (stored only; not used in the visible code).
            memory_size: number of rows in the replay memory.
            batch_size: size of each training batch (stored only; not used in
                the visible code).
            e_greedy_increment: increment of the greediness. When it is not
                None, ``self.epsilon`` starts at 0; otherwise it starts at
                ``e_greedy``.
            output_graph: when true, write the graph to ``logs/`` for
                TensorBoard.
        """
        self.cost = []  # record of cost values
        self.memory_counter = 0  # counter for the replay memory

        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.learn_step_counter = 0  # number of learning steps taken so far

        # Initialise the replay memory with zeros.
        # NOTE(review): the width n_features * 2 + 2 presumably holds
        # [s, a, r, s_] per row -- store_transition is not visible here,
        # confirm against it.
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # Build both networks: [target_net, evaluate_net].
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        # Ops that copy every evaluation-network parameter into the matching
        # target-network parameter.
        self.replace_target_op = [
            tf.assign(t, e) for t, e in zip(t_params, e_params)
        ]

        # Create the Saver only after every variable exists: tf.train.Saver()
        # without a var_list captures the variables present at construction
        # time, so creating it before _build_net() left both networks out of
        # every checkpoint (only an unused 5x5 placeholder variable named
        # "weights" was saved; that variable has been removed).
        # NOTE(review): checkpoints written by the earlier version will
        # presumably not restore with this saver, and seeded initial weights
        # may differ from earlier runs because the op order changed --
        # confirm no caller depends on either.
        self.saver = tf.train.Saver()

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []  # record of cost values

    def _build_layers(self, inputs, n_l1, c_names, w_initializer, b_initializer):
        """Build the l1 (ReLU) and l2 (linear) layers inside the current scope.

        Args:
            inputs: tensor fed to the first layer.
            n_l1: number of units in the first layer.
            c_names: collections every created variable is added to.
            w_initializer: initializer for the weight matrices.
            b_initializer: initializer for the bias rows.

        Returns:
            The output tensor of the second layer, ``n_actions`` wide.
        """
        # Adding the 1 x m bias to an n x m matrix adds the bias row to every
        # row of the matrix. relu(x) is 0 for x < 0 and x otherwise.
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.n_features, n_l1],
                                 initializer=w_initializer, collections=c_names)
            b1 = tf.get_variable('b1', [1, n_l1],
                                 initializer=b_initializer, collections=c_names)
            l1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)

        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [n_l1, self.n_actions],
                                 initializer=w_initializer, collections=c_names)
            b2 = tf.get_variable('b2', [1, self.n_actions],
                                 initializer=b_initializer, collections=c_names)
            return tf.matmul(l1, w2) + b2

    def _build_net(self):
        """Create the evaluation network, its loss/train ops and the target network."""
        # ------------------ evaluation network ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(
            tf.float32, [None, self.n_actions], name='q_target')  # used by the loss

        # Layer configuration shared by both networks.
        n_l1 = 10
        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)

        with tf.variable_scope('eval_net'):
            # c_names (collection names) are the collections the variables
            # are stored in.
            c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_eval = self._build_layers(
                self.s, n_l1, c_names, w_initializer, b_initializer)

        with tf.variable_scope('loss'):
            # Mean squared error between q_target and q_eval.
            self.loss = tf.reduce_mean(
                tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ target network ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_next = self._build_layers(
                self.s_, n_l1, c_names, w_initializer, b_initializer)

    # Replay memory (experience replay).
    # The replay memory stores transitions seen in the past. If the memory
    # size is N = 500, then once more than N transitions have been stored a
    # new transition overwrites the oldest one in the memory. The neural
    # network can then learn in batches: e.g. with batch = 32, that many
    # transitions are drawn from the memory in shuffled, random order and fed
    # to the network, which learns its parameters by back-propagation.
    # Transitions are drawn at random to break the correlation between them.
    #
    # NOTE: the source is truncated at this point. The original continues
    # with `def store_transition(self, s, a, r, s_):`, whose body is not
    # visible and is therefore not reproduced here.

评论收藏

内容反馈