clc;
close all;
clear all;
%probabilities of actions (defined here but not used below)
a=0.9;
b=0.05;
%state
S=1:16;
goal=16;
bad_state=10;
initial_state=5;
s_init=initial_state;
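%The 16 states form a 4x4 grid (here taken to be read row-wise, states 1-4
%across the top), and every move goes to a horizontally or vertically adjacent
%cell. State 5 is the start, state 10 is a penalised "bad" state and state 16
%is the goal. An action is identified with the state it moves to, so R, Q and
%N are all indexed as (state, next state).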
%Reward matrix: R(s,s') is the reward for moving from state s to state s'
%(NaN marks moves that are not allowed)
R=nan(16,16);
%-1 for every legal move between adjacent cells of the grid
R(1,2)=-1; R(1,5)=-1; R(2,1)=-1; R(2,3)=-1; R(2,6)=-1; R(3,2)=-1; R(3,4)=-1; R(3,7)=-1; R(4,3)=-1; R(4,8)=-1;
R(5,1)=-1; R(5,6)=-1; R(5,9)=-1; R(6,5)=-1; R(6,2)=-1; R(6,7)=-1; R(6,10)=-1; R(7,6)=-1; R(7,3)=-1; R(7,11)=-1;
R(7,8)=-1; R(8,7)=-1; R(8,4)=-1; R(8,12)=-1; R(9,5)=-1; R(9,10)=-1; R(9,13)=-1; R(10,9)=-1; R(10,6)=-1;
R(10,11)=-1; R(10,14)=-1; R(11,10)=-1; R(11,7)=-1; R(11,15)=-1; R(11,12)=-1; R(12,11)=-1; R(12,8)=-1; R(12,16)=-1;
R(13,9)=-1; R(13,14)=-1; R(14,13)=-1; R(14,10)=-1; R(14,15)=-1; R(15,11)=-1; R(15,14)=-1; R(16,15)=-1;
R(16,12)=-1;
%overrides: moving into the start state costs nothing, moving into the bad
%state is heavily penalised, and moving into the goal is rewarded
R(1,5)=0; R(9,5)=0; R(6,5)=0;
R(6,10)=-70; R(9,10)=-70; R(14,10)=-70; R(11,10)=-70;
R(12,16)=100; R(15,16)=100;
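%Optional cross-check (a minimal sketch, assuming the 4x4 row-wise layout and
%the reward rule "-1 per move, 0 into the start state, -70 into the bad state,
%+100 into the goal"): R_check rebuilds the same matrix programmatically. It is
%not used by the learning loops below; isequaln(R,R_check) should return true
%under these assumptions.
R_check=nan(16,16);
for s=1:16
    row=ceil(s/4); col=s-4*(row-1);   %grid coordinates of state s
    nbrs=[];                           %legal destination states
    if col>1, nbrs(end+1)=s-1; end
    if col<4, nbrs(end+1)=s+1; end
    if row>1, nbrs(end+1)=s-4; end
    if row<4, nbrs(end+1)=s+4; end
    for t=nbrs
        if t==initial_state
            R_check(s,t)=0;
        elseif t==bad_state
            R_check(s,t)=-70;
        elseif t==goal
            R_check(s,t)=100;
        else
            R_check(s,t)=-1;
        end
    end
end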
%action-value matrix
Q=zeros(16,16);
%count of each action from all states N(s,a)
N=zeros(16,16);
%hyperparameters
gamma=0.99;
epsilon1=0.05;
epsilon2=0.2;
bad_count=0;
episodes=1000;
avg_reward_sarsa=zeros(1,episodes);
iter=50; %number of runs to average over
states_visited_eps=zeros(1,episodes);
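%Experiment 1: tabular SARSA with epsilon-greedy exploration (epsilon1).
%Per-episode rewards and visited-state counts are accumulated over iter
%repetitions and averaged after the loop.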
for runs=1:iter
episode_count=0;
for episode=(1:episodes)
episode_count=episode_count+1;
episode_reward=0;
states_visited=0;
s_init=initial_state;
action_sarsa=0;
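%epsilon-greedy choice of the first action: exploit (largest Q among the legal
%moves) when the random draw exceeds epsilon1, otherwise explore with a
%uniformly random legal move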
possible_actions_sarsa=[];
for i=1:16
if (~isnan(R(s_init,i)))
possible_actions_sarsa=[possible_actions_sarsa,i];
end
end
l_sarsa=length(possible_actions_sarsa);
q_val_sarsa=[];
rnd=0.01*(randi(101)-1);
if(rnd>epsilon1)
for i=1:l_sarsa
q_val_sarsa=[q_val_sarsa,Q(s_init,possible_actions_sarsa(i))];
end
[m,index]=max(q_val_sarsa);
action_sarsa=possible_actions_sarsa(index);
else
index=randi(l_sarsa);
action_sarsa=possible_actions_sarsa(index);
end
while(s_init~=goal)
states_visited=states_visited+1;
if(s_init==bad_state) %the episode also ends if the agent falls into the bad state
break;
end
possible_actions_i=[];
i_state=action_sarsa;
for i=1:16
if (~isnan(R(i_state,i)))
possible_actions_i=[possible_actions_i,i];
end
end
l1=length(possible_actions_i);
q_val=[];
rnd=0.01*(randi(101)-1);
if(rnd>epsilon1)
for i=1:l1
q_val=[q_val,Q(i_state,possible_actions_i(i))];
end
[m,index]=max(q_val);
action_next_state=possible_actions_i(index);
else
index=randi(l1);
action_next_state=possible_actions_i(index);
end
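%SARSA update with decaying step size alpha=1/N(s,a):
%Q(s,a) <- Q(s,a) + alpha*( R(s,a) + gamma*Q(s',a') - Q(s,a) )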
N(s_init,i_state)= N(s_init,i_state)+1;
alpha=1/N(s_init,i_state);
Q(s_init,i_state)=Q(s_init,i_state)+alpha*(R(s_init,i_state)+gamma*Q(i_state,action_next_state)-Q(s_init,i_state));
episode_reward=episode_reward+R(s_init,i_state);
s_init=i_state;
action_sarsa=action_next_state;
end
states_visited_eps(1,episode_count)=states_visited_eps(1,episode_count)+states_visited;
avg_reward_sarsa(1,episode_count)=avg_reward_sarsa(1,episode_count)+episode_reward;
end
end
avg_reward_sarsa=avg_reward_sarsa/iter;
states_visited_eps=states_visited_eps/iter;
%Experiment 2: the same SARSA loop, but with epsilon2 in the epsilon-greedy choice
avg_reward_sarsa2=zeros(1,episodes);
states_visited_eps2=zeros(1,episodes);
%reset the action-value estimates and visit counts so both settings learn from scratch
Q=zeros(16,16);
N=zeros(16,16);
for runs=1:iter
episode_count=0;
for episode=(1:episodes)
episode_count=episode_count+1;
episode_reward=0;
states_visited=0;
s_init=initial_state;
action_sarsa=0;
possible_actions_sarsa=[];
for i=1:16
if (~isnan(R(s_init,i)))
possible_actions_sarsa=[possible_actions_sarsa,i];
end
end
l_sarsa=length(possible_actions_sarsa);
q_val_sarsa=[];
rnd=0.01*(randi(101)-1);
if(rnd>epsilon2)
for i=1:l_sarsa
q_val_sarsa=[q_val_sarsa,Q(s_init,possible_actions_sarsa(i))];
end
[m,index]=max(q_val_sarsa);
action_sarsa=possible_actions_sarsa(index);
else
index=randi(l_sarsa);
action_sarsa=possible_actions_sarsa(index);
end
while(s_init~=goal)
states_visited=states_visited+1;
if(s_init==bad_state) %the episode also ends if the agent falls into the bad state
break;
end
possible_actions_i=[];
i_state=action_sarsa;
for i=1:16
if (~isnan(R(i_state,i)))
possible_actions_i=[possible_actions_i,i];
end
end
l1=length(possible_actions_i);
q_val=[];
rnd=0.01*(randi(101)-1);
if(rnd>epsilon2)
for i=1:l1
q_val=[q_val,Q(i_state,possible_actions_i(i))];
end
[m,index]=max(q_val);
action_next_state=possible_actions_i(index);
else
index=randi(l1);
action_next_state=possible_actions_i(index);
end
N(s_init,i_state)= N(s_init,i_state)+1;
alpha=1/N(s_init,i_state);
Q(s_init,i_state)=Q(s_init,i_state)+alpha*(R(s_init,i_state)+gamma*Q(i_state,action_next_state)-Q(s_init,i_state));
episode_reward=episode_reward+R(s_init,i_state);
s_init=i_state;
action_sarsa=action_next_state;
end
states_visited_eps2(1,episode_count)=states_visited_eps2(1,episode_count)+states_visited;
avg_reward_sarsa2(1,episode_count)=avg_reward_sarsa2(1,episode_count)+episode_reward;
end
end
avg_reward_sarsa2=avg_reward_sarsa2/iter;
states_visited_eps2=states_visited_eps2/iter;
figure;
plot(avg_reward_sarsa)
hold on;
plot(avg_reward_sarsa2)
hold off;
xlabel('Number of Episodes')
ylabel('Average reward per episode')
legend('epsilon=0.05','epsilon=0.2')
title('Impact of epsilon on rate of convergence')
saveas(gcf,'sarsa_avg_reward_2.png')
figure;
plot(states_visited_eps)
hold on;
plot(states_visited_eps2)
hold off;
xlabel('Number of Episodes')
ylabel('Average number of states per episode')
legend('epsilon=0.05','epsilon=0.2')
title('Impact of epsilon on rate of convergence')
saveas(gcf,'sarsa_state_visited_2.png')
% plot(avg_reward_sarsa)
% figure;
% plot(states_visited_eps)
%
%
%
% avg_reward_eps10=[]
% states_visited_eps10=[]
% s=[],s_count=0;
% s_states=[]
% for i=1:episodes
% i
% s_count=s_count+1;
%
% s=[s,avg_reward_sarsa(1,i)];
% s_states=[s_states,states_visited_eps(1,i)]
% if(s_count==20)
% s_count=0;
% f1=mean(s)
% avg_reward_eps10=[avg_reward_eps10,f1];
%
% f2=mean(s_states)
% states_visited_eps10=[states_visited_eps10,f2];
% s=[];
% s_states=[];
% end
% end
% figure;
% plot(avg_reward_eps10)
% ylim=([1 100])
% figure;
% plot(states_visited_eps10)
% ylim=([1 100])
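%Optional refactoring sketch (not called above): the epsilon-greedy choice is
%written out four times in this script; a local function such as the
%hypothetical pick_action below could replace those blocks. Local functions at
%the end of a script require MATLAB R2016b or newer, and rand is used here in
%place of the 0.01*(randi(101)-1) draw used above.
function a=pick_action(Q,R,s,epsilon)
%Pick an action from state s: greedy w.r.t. Q with probability about
%(1-epsilon), otherwise a uniformly random legal move
%(legal = non-NaN entry in R(s,:)).
moves=find(~isnan(R(s,:)));
if rand>epsilon
    [~,idx]=max(Q(s,moves));   %exploit: first maximiser, as in the loops above
else
    idx=randi(length(moves));  %explore
end
a=moves(idx);
end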
