#!/usr/bin/env python2
import QLearning
import operator
import csv
from math import exp
from random import random
from numpy import cumsum
from os import listdir
import sys
class PRQLearning:
    """Policy Reuse Q-Learning (PRQL): learns a new task with Q-learning
    while probabilistically reusing a library of past policies
    (cf. Fernandez and Veloso's probabilistic policy reuse)."""
    def __init__(self, MDP, Agent, alpha, gamma, epsilon, epsilonIncrement,
                 K, H, gammaPRQL, tau, deltaTau, psi, v, filePath):
        self.MDP = MDP
        self.Agent = Agent
        self.alpha = alpha                        # learning rate
        self.gamma = gamma                        # discount factor
        self.epsilon = epsilon                    # epsilon-greedy exploration rate
        self.epsilonIncrement = epsilonIncrement  # epsilon growth per episode
        self.K = K                                # number of learning episodes
        self.H = H                                # maximum steps per episode
        self.gammaPRQL = gammaPRQL
        self.tau = tau                            # softmax temperature for policy choice
        self.deltaTau = deltaTau                  # temperature growth per episode
        self.psi = psi                            # initial past-policy reuse probability
        self.v = v                                # per-step decay rate of psi
        self.filePath = filePath                  # output directory for logs
        self.L = self.loadPolicies()              # library of past policies
        self.myQLearning = None
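    # A minimal usage sketch (illustrative only: the constructor calls and
    # parameter values below are hypothetical, not taken from this
    # repository's experiment scripts -- see RL-PRQL.py for the real setup):
    #
    #   mdp = MDP.MDP(...)               # task MDP: states, actions A, goals G
    #   agent = Agent.Agent(...)         # agent acting on that MDP
    #   prql = PRQLearning(mdp, agent,
    #                      alpha = 0.05, gamma = 0.95,
    #                      epsilon = 0.0, epsilonIncrement = 0.001,
    #                      K = 2000, H = 100, gammaPRQL = 0.95,
    #                      tau = 0.0, deltaTau = 0.05,
    #                      psi = 1.0, v = 0.95,
    #                      filePath = './output')
    #   output = prql.execute()          # dict of per-episode statistics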
    def execute(self):
        # Initialization: for each state-action pair (s, a) the table entry
        # Q_omega(s, a) starts at zero.  The W_omega from the article is
        # W[omega], with omega == 0, and the same applies to U_omega.
        L = self.L  # policy library (already loaded in __init__)
        Q_omega, W, U = self.initializeQ_omegaWU()
        myQLearning = self.initializeQLearning(Q = Q_omega)
        self.myQLearning = myQLearning
        # TODO: move this logging into a separate function
        logStuff = [['iteration', 'state', 'psi', 'randomNumber1',
                     'randomNumber2', 'epsilon', 'a', 'r', 'W']]
        # per-episode collections, kept for logging and plotting
        W_avg_list = []  # average reward w_avg after each episode
        Ws = []          # snapshot of W after each episode
        Ps = []          # selection probabilities P from each episode
        Ks = []          # policy chosen in each episode
        PRvsQLs = []     # running counts of episodes run with policy reuse
                         # (PR) versus plain Q-learning (QL)
        output = {'W_avg_list' : W_avg_list, 'Ws' : Ws, 'Ps' : Ps,
                  'Ks' : Ks, 'PRvsQL' : PRvsQLs}
        w_avg = 0.0  # average cumulative reward, regardless of the policy used
        pr = 0; ql = 0
        for episode in range(self.K):
            # Assign to each policy the probability of being selected
            P = self.assignProbsToPolicies(W, self.tau)
            Ps.append(P)
            # Choose an action policy PI_k
            k = self.choosePolicy(P)
            Ks.append([k])
            # Execute learning episode k; receive the reward R and the
            # updated Q function (Q_omega)
            if k == 0:
                # If PI_k == PI_omega, run plain Q-learning.  Ws_dump is
                # discarded because QLearning runs a single episode here,
                # so the returned list contains only R itself.
                R, Ws_dump = myQLearning.execute()
                ql += 1
            else:
                # Otherwise, use the pi-reuse strategy to reuse PI_k
                Pi_past = L[k]
                R = self.pi_reuse(Pi_past, 1, self.H, self.psi, self.v,
                                  logStuff, Q_omega)
                pr += 1
            # Incremental average of the rewards obtained with policy k:
            # W[k] <- (W[k] * U[k] + R) / (U[k] + 1)
            W[k] = ((W[k] * U[k]) + R) / (U[k] + 1)
            Ws.append(W[:])  # append a copy, not a reference
            U[k] = U[k] + 1
            # TODO: verify this
            self.myQLearning.epsilon += self.myQLearning.epsilonIncrement
            self.tau = self.tau + self.deltaTau
            w_avg = ((w_avg * episode) + R) / (episode + 1)
            W_avg_list.append([w_avg])
            print 'pr: ', pr, 'ql: ', ql; sys.stdout.flush()
            PRvsQLs.append([pr, ql])
        f = open(self.filePath + '/log.txt', 'w')
        wr = csv.writer(f, quoting = csv.QUOTE_NONNUMERIC)
        wr.writerows(logStuff)
        f.close()
        return output
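    # The W[k] update in execute() is the standard incremental mean: after
    # the (U[k] + 1)-th episode run with policy k yields reward R,
    #     W[k] <- (W[k] * U[k] + R) / (U[k] + 1)
    # Worked example (illustrative numbers): W[k] = 0.5 with U[k] = 2 and a
    # new R = 1.3 gives W[k] = (0.5 * 2 + 1.3) / 3 = 0.766...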
    def pi_reuse(self, Pi_past, K, H, psi, v, logStuff, Q_pi_new = None):
        """pi-reuse exploration strategy: at each step, follow the past
        policy Pi_past with probability psi, otherwise act epsilon-greedily
        on the new Q table; psi decays by a factor of v after every step."""
        Q_pi_new = self.initializeQ_pi_new(Q_pi_new)
        randomNumber1 = -1
        randomNumber2 = -1
        epsilon = -1
        W = 0.0
        for k in range(K):
            # Set the initial state, s, randomly
            self.Agent.setInitialState()
            psi = self.psi
            for h in range(1, H + 1):
                s = self.Agent.state
                logStuffLine = []
                logStuffLine.append(len(logStuff) + 1)
                logStuffLine.append(s)
                # the episode ends when a goal state is reached
                if s in self.MDP.G:
                    break
                randomNumber1 = random()
                if randomNumber1 <= psi:
                    # With probability psi, use the policy from the library
                    # (a = Pi_past(s))
                    a = self.Agent.selectBestAction(s, source = 'Probabilistic Policy',
                                                    Pi = Pi_past)
                else:
                    # With probability (1 - psi), a = epsilon_greedy(PI_new(s))
                    randomNumber2 = random()
                    epsilon = 1 - psi
                    if randomNumber2 <= epsilon:
                        # greedy
                        a = self.Agent.selectBestAction(s, source = 'Q-Table',
                                                        Q = Q_pi_new)
                    else:
                        # random
                        a = self.Agent.selectRandomAction()
                # Execute the action; receive the next state s' and the
                # reward r_{k, h}
                s2, r = self.Agent.executeAction(a)
                # TODO: keep a vector V with the per-state maxima
                maxValue = max(Q_pi_new[s2][a2] for a2 in self.MDP.A)
                # Update Q_pi_new(s, a), and therefore PI_new
                Q_pi_new[s][a] = ((1.0 - self.alpha) * Q_pi_new[s][a]) + \
                                 self.alpha * (r + self.gamma * maxValue)
                # Set psi_(h + 1) = psi_h * v
                psi = psi * v
                # accumulate the discounted reward on W
                W = W + pow(self.gamma, h) * r
                # Set s = s'
                self.Agent.state = s2
                # log this step
                logStuffLine.append(psi)
                logStuffLine.append(float(randomNumber1))
                logStuffLine.append(float(randomNumber2))
                logStuffLine.append(epsilon)
                logStuffLine.append(a)
                logStuffLine.append(r)
                logStuffLine.append(W)
                logStuff.append(logStuffLine)
        # Average the accumulated gain over the K episodes
        W = W / float(K)
        return W
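    # Within a pi-reuse episode the reuse probability decays geometrically,
    #     psi_h = psi_1 * v**(h - 1),
    # so with psi_1 = 1.0 and v = 0.95 (illustrative values) psi_10 ~ 0.63
    # and psi_100 ~ 0.006: early steps mostly follow the past policy, later
    # steps mostly follow the epsilon-greedy new policy.  The value returned
    # is the discounted return averaged over the K episodes,
    #     W = (1 / K) * sum_k sum_h gamma**h * r_{k, h}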
    def assignProbsToPolicies(self, W, tau):
        """Boltzmann (softmax) selection probabilities over the policy
        library:  P(k) = exp(tau * W[k]) / sum_j exp(tau * W[j])."""
        denominator = 0.0
        for w in W:
            denominator += exp(tau * w)
        P = [exp(tau * w) / denominator for w in W]
        return P
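    # Worked example (illustrative numbers): W = [0.0, 1.0] and tau = 1.0
    # give P = [e**0, e**1] / (e**0 + e**1) ~ [0.269, 0.731].  With tau = 0
    # the choice is uniform; as tau grows by deltaTau each episode, the
    # distribution concentrates on the best-performing policy.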
    def choosePolicy(self, P):
        """Sample a policy index k from the probability vector P
        (roulette-wheel selection over the cumulative distribution)."""
        P = cumsum(P)
        randomNumber = random()
        k = 0
        for i in range(len(P)):
            # strictly less than, since random.random() draws from [0, 1)
            if randomNumber < P[i]:
                k = i
                break
        return k
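    # Worked example (illustrative numbers): P = [0.2, 0.3, 0.5] gives
    # cumsum(P) = [0.2, 0.5, 1.0]; a draw of randomNumber = 0.6 falls in
    # [0.5, 1.0), so k = 2 is chosen.  Each index i is selected with
    # probability P[i].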