import numpy as np
# Reward matrix: R[s, a] is the immediate reward for taking action a from
# state s (the action index is also the index of the state moved to).
# Negative entries are the ones available_actions() filters out, since it
# keeps only entries >= 0.
R = np.matrix(
    [
        [-1, -1, -1, -1, 0, -1],
        [-1, -1, -1, 0, -1, 100],
        [-1, -1, -1, 0, -1, -1],
        [-1, 0, 0, -1, 0, -1],
        [-1, 0, 0, -1, -1, 100],
        [-1, 0, -1, -1, 0, 100],
    ]
)
# Q matrix: 6x6 table of learned values, all zeros before training.
Q = np.matrix(np.zeros((6, 6)))
# Gamma: weight applied to the best Q value of the next state in update().
gamma = 0.8
# Initial state. (Usually to be chosen at random)
initial_state = 1
def available_actions(state, rewards=None):
    """Return the indices of all actions available in ``state``.

    An action is available when its entry in the state's reward row is >= 0.

    Args:
        state: Row index of the state to inspect.
        rewards: Optional 2-D reward table (``np.matrix`` or array-like).
            Defaults to the module-level ``R``.

    Returns:
        1-D integer array with the column indices of the available actions
        (empty when the row has no entry >= 0).
    """
    if rewards is None:
        rewards = R
    # np.asarray makes the row a plain 1-D vector for both np.matrix and
    # ndarray inputs, so the result does not depend on matrix-only indexing.
    row = np.asarray(rewards)[state]
    return np.flatnonzero(row >= 0)
# Actions that can be taken from the initial state
available_act = available_actions(state=initial_state)
def sample_next_action(available_actions_range):
    """Pick one action at random from ``available_actions_range``.

    Args:
        available_actions_range: Non-empty 1-D sequence or array of action
            indices to choose from.

    Returns:
        The chosen action as a plain ``int``.

    Raises:
        ValueError: Raised by ``np.random.choice`` when the range is empty.
    """
    # Sample from the argument rather than a module-level variable. Without
    # ``size`` np.random.choice returns a scalar, so int() never has to
    # convert a 1-element array (deprecated in NumPy >= 1.25).
    return int(np.random.choice(available_actions_range))
# Draw the first action to perform
action = sample_next_action(available_actions_range=available_act)
def update(current_state, action, gamma, q_table=None, rewards=None):
    """Apply one Q-learning update for taking ``action`` in ``current_state``.

    Writes, in place::

        q_table[current_state, action] =
            rewards[current_state, action] + gamma * max(q_table[action, :])

    The action index doubles as the index of the next state, which is why
    the maximum is taken over row ``action``.

    Args:
        current_state: Row index of the state the action is taken from.
        action: Column index of the action (and row index of the next state).
        gamma: Factor applied to the best value of the next state.
        q_table: Optional 2-D ``np.matrix``/``ndarray`` to update in place.
            Defaults to the module-level ``Q``.
        rewards: Optional 2-D ``np.matrix``/``ndarray`` of rewards.
            Defaults to the module-level ``R``.

    Returns:
        None. The table is modified in place.
    """
    if q_table is None:
        q_table = Q
    if rewards is None:
        rewards = R
    # Only the maximum value of the next state's row is needed; which index
    # attains it is irrelevant, so no (random) tie-breaking is required.
    max_value = np.max(q_table[action])
    # Q learning formula
    q_table[current_state, action] = (
        rewards[current_state, action] + gamma * max_value
    )
# Run a single Q update for the action sampled from the initial state
update(current_state=initial_state, action=action, gamma=gamma)
#-------------------------------------------------------------------------------
# Training
# Train over 10 000 iterations. (Re-iterate the process above).
for _ in range(10000):
    # Each iteration starts from a randomly drawn state (upper bound is
    # exclusive), samples one of its available actions and updates Q.
    current_state = np.random.randint(0, Q.shape[0])
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)
# Show the trained Q matrix rescaled so that its largest entry becomes 100
print("Trained Q matrix:")
print(Q / Q.max() * 100)
#-------------------------------------------------------------------------------
# Testing
# Goal state = 5
# Best sequence path starting from 2 -> 2, 3, 1, 5
current_state = 2
steps = [current_state]
# Follow the highest-valued action from each state; the loop ends only when
# state 5 is reached.
while current_state != 5:
    # np.asarray yields a plain 1-D row whether Q is an np.matrix or ndarray.
    q_row = np.asarray(Q)[current_state]
    next_step_index = np.flatnonzero(q_row == q_row.max())
    if next_step_index.size > 1:
        # Several actions share the best value: break the tie at random.
        next_step_index = int(np.random.choice(next_step_index))
    else:
        # Index the single element explicitly; int() on a 1-element array
        # is deprecated in NumPy >= 1.25.
        next_step_index = int(next_step_index[0])
    steps.append(next_step_index)
    current_state = next_step_index
# Report the states visited, header first and the list on the next line
print("Selected path:", steps, sep="\n")
HUAWEI(华为) atlas 200 DK环境配置方法、网络配置、例程速跑及强化学习示例昇腾代码仓使用MobaXterm
需积分: 5 24 浏览量
2022-03-29
13:09:02
上传
评论 5
收藏 2.95MB ZIP 举报
thepresentthepresent
- 粉丝: 1
- 资源: 4
最新资源
- WANGSHANGYINHANG-4.2.9.031406-android
- docker&docker-compose离线安装包(centos)
- 滑动窗口是一种流量控制技术,用于在数据传输过程中进行拥塞控制和流量调节4.txt
- Nacos如何支持服务发现和注册-基于词频统计的分析.txt
- :基于java打造的深度学习框架,帮助你快速搭建神经网络,实现模型推理与训练,引擎支持自动求导,多线程与GPU运算
- 第九次作业(XY图,XY图显示,三维曲面,数字波形图)
- 微信小程序实战案例:打造高效便捷的在线书店.zip
- 1.0.5win(1)(1).exe
- ESP8266 WiFi模块入门教程:从连接到配置.zip
- 词频统计:从基础到实践的应用指南.zip
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
评论0