import numpy as np
import datetime
import random
# Environment: a grid of width x height cells.
width = 7
height = 7
# Hyper-parameters: greedy coefficient, learning rate, discount factor.
epsilon = 0.7
alpha = 0.3
gamma = 0.99
# Maximum number of episodes.
max_episodes = 2000
# Start position: the centre cell, derived from the grid size so it stays
# centred if width/height are changed (evaluates to 3, 3 for the 7 x 7 grid).
start_x = (width - 1) // 2
start_y = (height - 1) // 2
# "Not reached yet" markers, one slot per target: slot i holds i + 1 until
# get_back_reward sees the agent on target i, then the slot is reset to 0.
point_num_flag = [1, 2, 3, 4]
# Ids (1-4) of the targets reached so far; appended to by get_back_reward.
point_num = []
# Action space: 0, 1, 2, 3 = up, down, left, right.
# Reward function.
def get_back_reward(x, y):
    """Return the reward for the agent standing on cell (x, y).

    Four target cells sit one cell in from each corner of the grid. The
    first time the agent lands on a target, that target's slot in the
    module-level ``point_num_flag`` is reset to 0, a message is printed and
    the target id (1-4) is appended to the module-level ``point_num``.

    Args:
        x: Column index of the agent.
        y: Row index of the agent.

    Returns:
        3 if all four target ids are present in ``point_num`` (including
        the call that records the last one, and every call after that),
        otherwise 1 if this call recorded a new target, otherwise 0.
    """
    # Targets in the same order as the slots of point_num_flag. Every
    # coordinate is derived from the grid size (D used a literal 5 before).
    # The log messages are kept verbatim, so the coordinates they mention
    # are those of the default 7 x 7 grid.
    targets = (
        (width - 2, height - 2, "代理到达A点[5,5]"),
        (1, height - 2, "代理到达B点[1,5]"),
        (1, 1, "代理到达C点[1,1]"),
        (width - 2, 1, "代理到达D点[5,1]"),
    )

    number = 0
    for index, (target_x, target_y, message) in enumerate(targets):
        target_id = index + 1
        # A slot still holding its own id means the target is unclaimed.
        if (x, y) == (target_x, target_y) and point_num_flag[index] == target_id:
            point_num_flag[index] = 0
            print(message)
            point_num.append(target_id)
            number = 1
            break  # at most one target is credited per call

    # NOTE(review): assumed to run after the branch chain on every call;
    # confirm against the original indentation.
    if len(set(point_num)) == 4:
        number = 3
    return number
# 根据当前位置和动作得到下一个位置
def get_next_state(x, y, action):