clear all
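% TD3 (Twin Delayed Deep Deterministic Policy Gradient) training loop for a
% double-integrator (position/velocity) plant, built on hand-rolled fully
% connected networks with batch normalization and Adam. Twin critics, target
% networks, decaying exploration noise, and target-policy smoothing follow
% the TD3 algorithm (Fujimoto et al., 2018).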
% Initialize hyperparameters
lr_Q=1e-3; % critic (Q-network) learning rate
lr_P=1e-4; % actor (P-network) learning rate
lr_BN=1e-3; % batch-norm parameter learning rate
tao=0.001; % soft target-update rate (tau)
epsilon=1e-8; % Adam numerical-stability term
beta1=0.9; % Adam first-moment decay
beta2=0.999; % Adam second-moment decay
alpha=0.3; % initial exploration-noise scale
gamma=0.999; % discount factor
num_Q_nodes=[3;100;200;1]; % Q-network structure: nodes per layer (input [s;a] -> output Q)
num_P_nodes=[2;100;200;1]; % P-network structure: nodes per layer (input s -> output a)
flag_done=0; % done flag, set when the episode terminates
a_estimate=[];
a_new_estimate=[];
plot_x=[];
plot_a=[];
plot_Loss_a=[];
plot_v=[];
plot_wP=[];
plot_wQ=[];
plot_bP=[];
plot_bQ=[];
plot_error=[];
plot_reward=[];
plot_Loss=[];
Q=[];
plot_Q=[];
Q_new=[];
reset=0;
total_reward=0; % cumulative reward
r=0; % current reward
s=0;
v=1*(2*rand()-1); % velocity
x=1*(2*rand()-1); % position
s_new=0;
x_new=0;
a=0;
node_input_Pnet_batch={}; % per-layer outputs of the P (actor) network; rows: layers, columns: batch samples
node_input_Qnet_batch={}; % per-layer outputs of the Q (critic) network; rows: layers, columns: batch samples
size_batch=100; % minibatch size
size_ex=10000; % replay-buffer capacity
epoch=200; % steps per episode
episode=1000; % number of episodes
step=0;
count=0;
count_Q=0;
hidden_node_type='leakyrelu'; % hidden-layer activation
out_node_type='tanh'; % output-layer activation
ex=[]; % replay buffer
% Initialize the networks
[w_Q, b_Q, node_type_Q] = init_a_net(num_Q_nodes,hidden_node_type,out_node_type);
[w_Q2, b_Q2, node_type_Q2] = init_a_net(num_Q_nodes,hidden_node_type,out_node_type);
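% Target networks start as copies of the online networks; TD3 soft-updates
% them toward the online weights with rate tao during training.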
w_Q_target=w_Q;
b_Q_target=b_Q;
w_Q_target2=w_Q2;
b_Q_target2=b_Q2;
[w_P, b_P, node_type_P] = init_a_net(num_P_nodes,hidden_node_type,out_node_type);
w_P_target=w_P;
b_P_target=b_P;
% Initialize Adam optimizer state (first and second moments)
m_w_Q=init_cell(w_Q,'zero');
m_w_Q2=init_cell(w_Q2,'zero');
v_w_Q=init_cell(w_Q,'zero');
v_w_Q2=init_cell(w_Q2,'zero');
m_w_P=init_cell(w_P,'zero');
v_w_P=init_cell(w_P,'zero');
m_b_Q=init_cell(b_Q,'zero');
m_b_Q2=init_cell(b_Q2,'zero');
v_b_Q=init_cell(b_Q,'zero');
v_b_Q2=init_cell(b_Q2,'zero');
m_b_P=init_cell(b_P,'zero');
v_b_P=init_cell(b_P,'zero');
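% These m/v cells hold Adam's per-parameter moment estimates. Presumably
% adamupdate_net (its source is in the archive) implements the standard rule:
%   m <- beta1*m + (1-beta1)*g,   v <- beta2*v + (1-beta2)*g.^2
%   w <- w - lr * (m./(1-beta1^t)) ./ (sqrt(v./(1-beta2^t)) + epsilon)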
% Initialize batch-normalization parameters (scale gamma, shift beta) and running statistics
bn_gamma_Q=init_cell(b_Q,'one');
bn_beta_Q=init_cell(b_Q,'zero');
bn_gamma_Q2=init_cell(b_Q2,'one');
bn_beta_Q2=init_cell(b_Q2,'zero');
bn_gamma_Q_target=init_cell(b_Q,'one');
bn_beta_Q_target=init_cell(b_Q,'zero');
bn_gamma_Q2_target=init_cell(b_Q2,'one');
bn_beta_Q2_target=init_cell(b_Q2,'zero');
bn_gamma_P_target=init_cell(b_P,'one');
bn_beta_P_target=init_cell(b_P,'zero');
bn_gamma_P=init_cell(b_P,'one');
bn_beta_P=init_cell(b_P,'zero');
bn_mu_P_average=init_cell(b_P,'zero');
bn_sigma_P_average=init_cell(b_P,'one');
m_gamma_Q=init_cell(bn_gamma_Q,'zero');
v_gamma_Q=init_cell(bn_gamma_Q,'zero');
m_gamma_Q2=init_cell(bn_gamma_Q2,'zero');
v_gamma_Q2=init_cell(bn_gamma_Q2,'zero');
m_beta_Q=init_cell(bn_beta_Q,'zero');
v_beta_Q=init_cell(bn_beta_Q,'zero');
m_beta_Q2=init_cell(bn_beta_Q2,'zero');
v_beta_Q2=init_cell(bn_beta_Q2,'zero');
m_gamma_P=init_cell(bn_gamma_P,'zero');
v_gamma_P=init_cell(bn_gamma_P,'zero');
m_beta_P=init_cell(bn_beta_P,'zero');
v_beta_P=init_cell(bn_beta_P,'zero');
bn_mu_Q_average=init_cell(b_Q,'zero');
bn_sigma_Q_average=init_cell(b_Q,'one');
bn_mu_Q2_average=init_cell(b_Q2,'zero');
bn_sigma_Q2_average=init_cell(b_Q2,'one');
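% The bn_*_average cells accumulate running means/variances of the minibatch
% statistics (updated inside the training loop below); batch_norm_predict
% presumably substitutes them for batch statistics when its training flag is 0.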
flag_bn=0; % enable/disable batch normalization
flag_train=1; % training-mode flag passed to batch_norm_predict
plot_prob_P=[];
plot_prob_Q=[];
% Training loop
for k=1:episode
reset=1;
x=1*(2*rand()-1);
v=1*(2*rand()-1);
s=[x;v];
total_reward=0;
total_q=0;
for i=1:epoch
% Clear per-step estimates
a_estimate=[];
a_new_estimate=[];
Q=[];
Q_new=[];
q_policy=[];
% Select an action with the actor, then evaluate it with both critics:
[a] = batch_norm_predict(w_P,b_P,s',1,node_type_P,bn_gamma_P,bn_beta_P,bn_mu_P_average,bn_sigma_P_average,flag_bn,0);
[Q1] = batch_norm_predict(w_Q,b_Q,[s' a],1,node_type_Q,bn_gamma_Q,bn_beta_Q,bn_mu_Q_average,bn_sigma_Q_average,flag_bn,0);
[Q2] = batch_norm_predict(w_Q2,b_Q2,[s' a],1,node_type_Q2,bn_gamma_Q2,bn_beta_Q2,bn_mu_Q2_average,bn_sigma_Q2_average,flag_bn,0);
Q=min(Q1,Q2); % clipped double-Q estimate (logged below)
% Add exploration noise and clamp the action to [-1,1]. This is off-policy:
% the behavior policy is the deterministic actor P plus a Gaussian perturbation.
a = a + alpha*randn();
a = max(min(a,1),-1);
if alpha>0
alpha=alpha-0.0001; % decay the exploration-noise scale
end
% Step the environment
s_new = DoubleIntegrateModel_UpdateEnvonrement(a,x,v,reset);
reset=0;
% Compute the reward
r = DoubleIntegrateModel_Reward(s_new(1,1),s_new(2,1),a);
if abs(s(1,1))>1 % position left [-1,1]: episode fails with penalty
flag_done=1;
r=-1;
end
% Log diagnostics for plotting
plot_Q=[plot_Q Q];
plot_reward=[plot_reward;r];
total_reward = total_reward + r;
plot_x=[plot_x x];
plot_a=[plot_a a];
plot_v=[plot_v v];
plot_wP=[plot_wP w_P{2,1}(1,1)];
plot_wQ=[plot_wQ w_Q{2,1}(1,1)];
plot_bP=[plot_bP b_P{2,1}(1,1)];
plot_bQ=[plot_bQ b_Q{2,1}(1,1)];
if (norm(s,1)<0.01) % state close enough to the origin: episode solved
break
end
% Store the transition in the replay buffer
ex = add_experience(s,a,r,s_new,flag_done,ex,size_ex);
x=s_new(1,1);
v=s_new(2,1);
s=s_new;
if flag_done % terminal failure: reset the environment before the next episode
x_new=0;
flag_done=0;
reset=1;
a=0;
s = DoubleIntegrateModel_UpdateEnvonrement(a,x,v,reset );
reset=0;
x=s(1,1);
v=s(2,1);
break
end
if step<=2*size_batch % warm-up: gather experience before learning starts
total_q=0;
step=step+1;
continue
end
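% Each column of ex holds one transition laid out as
% [s(1:2); a; r; s_new(1:2); flag_done] (inferred from the unpacking below).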
% Sample a minibatch from the replay buffer
sample = sample_experience(ex,size_batch);
sample_s = sample(1:2,:)';
sample_a = sample(3,:)';
sample_r = sample(4,:)';
sample_s_new = sample(5:6,:)';
sample_flag_done = sample(7,:)';
% Forward passes on the minibatch:
% estimate a(t+1) with the target actor, using s(t+1) from the buffer
[a_new_estimate,~,bn_mu_P,bn_sigma_P] = batch_norm_predict(w_P_target,b_P_target,sample_s_new,size_batch,node_type_P,bn_gamma_P_target,bn_beta_P_target,bn_mu_P_average,bn_sigma_P_average,flag_bn,flag_train);
% Target-policy smoothing: the TD3 paper argues that similar actions should
% have similar values, so evaluating the TD target Y in a neighborhood of a
% reduces the variance of Y. (The paper also clips the noise itself to +/-c;
% here the noise is left unclipped and the resulting action is clamped.)
a_new_estimate = a_new_estimate + 0.1*randn(size(a_new_estimate));
a_new_estimate = max(min(a_new_estimate,1),-1); % clamp smoothed actions to [-1,1]
% Critic values Q(s,a), evaluated at the actions a stored in the buffer
[Q, node_input_Qnet_batch,bn_mu_Q,bn_sigma_Q,prob_Q] = batch_norm_predict(w_Q,b_Q,[sample_s sample_a],size_batch,node_type_Q,bn_gamma_Q,bn_beta_Q,bn_mu_Q_average,bn_sigma_Q_average,flag_bn,flag_train);
[q2, node_input_qnet_batch2,bn_mu_Q2,bn_sigma_Q2] = batch_norm_predict(w_Q2,b_Q2,[sample_s sample_a],size_batch,node_type_Q2,bn_gamma_Q2,bn_beta_Q2,bn_mu_Q2_average,bn_sigma_Q2_average,flag_bn,flag_train);
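% Update the running batch-norm statistics with the standard running-mean
% recursion avg_{n+1} = (n*avg_n + x_{n+1})/(n+1), applied cell-wise with
% n = count_Q.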
bn_mu_Q_average=mul_cell(add_cell(mul_cell(bn_mu_Q_average,count_Q),bn_mu_Q),1/(count_Q+1));
bn_sigma_Q_average=mul_cell(add_cell(mul_cell(bn_sigma_Q_average,count_Q),bn_sigma_Q),1/(count_Q+1));
bn_mu_Q2_average=mul_cell(add_cell(mul_cell(bn_mu_Q2_average,count_Q),bn_mu_Q2),1/(count_Q+1));
bn_sigma_Q2_average=mul_cell(add_cell(mul_cell(bn_sigma_Q2_average,count_Q),bn_sigma_Q2),1/(count_Q+1));
count_Q=count_Q+1;
% Critic values Q(s',a'), evaluated at the smoothed estimate of a(t+1)
[Q_new, ~] = batch_norm_predict(w_Q_target,b_Q_target,[sample_s_new a_new_estimate],size_batch,node_type_Q,bn_gamma_Q_target,bn_beta_Q_target,bn_mu_Q_average,bn_sigma_Q_average,flag_bn,flag_train);
[Q2_new, ~] = batch_norm_predict(w_Q_target2,b_Q_target2,[sample_s_new a_new_estimate],size_batch,node_type_Q2,bn_gamma_Q2_target,bn_beta_Q2_target,bn_mu_Q2_average,bn_sigma_Q2_average,flag_bn,flag_train);
total_q=total_q+sum(min(Q_new,Q2_new))/size_batch;
% TD target with clipped double-Q: take the minimum of the two target critics
TD_target = sample_r + gamma * min(Q_new,Q2_new) .* (1-sample_flag_done);
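% --- The original listing is truncated at this point. What follows is a
% hedged sketch of the steps a TD3 loop typically performs next: a gradient
% step on both critics, a delayed actor update, and soft target-network
% updates. The backprop and Adam calls are left as comments because the
% signatures of batch_norm_bp and adamupdate_net are not shown above; the
% soft updates use only add_cell/mul_cell, whose call patterns appear
% earlier in this file.
% Critic loss: both critics regress onto the shared TD target
Loss = sum((TD_target - Q).^2 + (TD_target - q2).^2)/(2*size_batch);
plot_Loss = [plot_Loss Loss];
% [dw,db,...] = batch_norm_bp(...);                % backprop through each critic
% [w_Q,b_Q,m_w_Q,v_w_Q,...] = adamupdate_net(...); % Adam step with lr_Q
% Delayed policy update: refresh the actor and all targets every 2nd
% critic update (d = 2 in the TD3 paper)
if mod(count_Q,2)==0
% gradient ascent on Q1(s, P(s)) with lr_P would go here
% Soft target updates with rate tao:
w_Q_target = add_cell(mul_cell(w_Q,tao), mul_cell(w_Q_target,1-tao));
b_Q_target = add_cell(mul_cell(b_Q,tao), mul_cell(b_Q_target,1-tao));
w_Q_target2 = add_cell(mul_cell(w_Q2,tao), mul_cell(w_Q_target2,1-tao));
b_Q_target2 = add_cell(mul_cell(b_Q2,tao), mul_cell(b_Q_target2,1-tao));
w_P_target = add_cell(mul_cell(w_P,tao), mul_cell(w_P_target,1-tao));
b_P_target = add_cell(mul_cell(b_P,tao), mul_cell(b_P_target,1-tao));
end
end
end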
% Contents of the accompanying TD3.rar archive (24 files): main.m,
% init_a_net.m, init_cell.m, batch_norm_predict.m, batch_norm_bp.m,
% adamupdate_net.m, sgdupdate_net.m, net_fp.m, fp.m, LMbp.m, loss_function.m,
% loss_function_derivation.m, node_function_derivative.m, add_experience.m,
% sample_experience.m, add_cell.m, mul_cell.m, sum_cell.m, sigmoid.m,
% leaky_relu.m, map.m, example_function_fitting.m,
% DoubleIntegrateModel_UpdateEnvonrement.m, DoubleIntegrateModel_Reward.m