#!/usr/bin/env python2
import QLearning
import operator
import csv
from math import exp
from random import random
from numpy import cumsum
from os import listdir
import sys
class PRQLearning:
    """Policy Reuse Q-Learning (PRQL): learns a new task with Q-learning
    while probabilistically reusing a library of past policies
    (cf. Fernandez and Veloso's probabilistic policy reuse)."""
    def __init__(self, MDP, Agent, alpha, gamma, epsilon, epsilonIncrement,
                 K, H, gammaPRQL, tau, deltaTau, psi, v, filePath):
        self.MDP = MDP
        self.Agent = Agent
        self.alpha = alpha                        # learning rate
        self.gamma = gamma                        # discount factor
        self.epsilon = epsilon                    # epsilon-greedy exploration rate
        self.epsilonIncrement = epsilonIncrement  # epsilon growth per episode
        self.K = K                                # number of learning episodes
        self.H = H                                # maximum steps per episode
        self.gammaPRQL = gammaPRQL
        self.tau = tau                            # softmax temperature for policy choice
        self.deltaTau = deltaTau                  # temperature growth per episode
        self.psi = psi                            # initial past-policy reuse probability
        self.v = v                                # per-step decay rate of psi
        self.filePath = filePath                  # output directory for logs
        self.L = self.loadPolicies()              # library of past policies
        self.myQLearning = None
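    # A minimal usage sketch (illustrative only: the constructor calls and
    # parameter values below are hypothetical, not taken from this
    # repository's experiment scripts -- see RL-PRQL.py for the real setup):
    #
    #   mdp = MDP.MDP(...)               # task MDP: states, actions A, goals G
    #   agent = Agent.Agent(...)         # agent acting on that MDP
    #   prql = PRQLearning(mdp, agent,
    #                      alpha = 0.05, gamma = 0.95,
    #                      epsilon = 0.0, epsilonIncrement = 0.001,
    #                      K = 2000, H = 100, gammaPRQL = 0.95,
    #                      tau = 0.0, deltaTau = 0.05,
    #                      psi = 1.0, v = 0.95,
    #                      filePath = './output')
    #   output = prql.execute()          # dict of per-episode statistics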
    def execute(self):
        # Initialization: for each state-action pair (s, a) the table entry
        # Q_omega(s, a) starts at zero.  The W_omega from the article is
        # W[omega], with omega == 0, and the same applies to U_omega.
        L = self.L  # policy library (already loaded in __init__)
        Q_omega, W, U = self.initializeQ_omegaWU()
        myQLearning = self.initializeQLearning(Q = Q_omega)
        self.myQLearning = myQLearning
        # TODO: move this logging into a separate function
        logStuff = [['iteration', 'state', 'psi', 'randomNumber1',
                     'randomNumber2', 'epsilon', 'a', 'r', 'W']]
        # per-episode collections, kept for logging and plotting
        W_avg_list = []  # average reward w_avg after each episode
        Ws = []          # snapshot of W after each episode
        Ps = []          # selection probabilities P from each episode
        Ks = []          # policy chosen in each episode
        PRvsQLs = []     # running counts of episodes run with policy reuse
                         # (PR) versus plain Q-learning (QL)
        output = {'W_avg_list' : W_avg_list, 'Ws' : Ws, 'Ps' : Ps,
                  'Ks' : Ks, 'PRvsQL' : PRvsQLs}
        w_avg = 0.0  # average cumulative reward, regardless of the policy used
        pr = 0; ql = 0
        for episode in range(self.K):
            # Assign to each policy the probability of being selected
            P = self.assignProbsToPolicies(W, self.tau)
            Ps.append(P)
            # Choose an action policy PI_k
            k = self.choosePolicy(P)
            Ks.append([k])
            # Execute learning episode k; receive the reward R and the
            # updated Q function (Q_omega)
            if k == 0:
                # If PI_k == PI_omega, run plain Q-learning.  Ws_dump is
                # discarded because QLearning runs a single episode here,
                # so the returned list contains only R itself.
                R, Ws_dump = myQLearning.execute()
                ql += 1
            else:
                # Otherwise, use the pi-reuse strategy to reuse PI_k
                Pi_past = L[k]
                R = self.pi_reuse(Pi_past, 1, self.H, self.psi, self.v,
                                  logStuff, Q_omega)
                pr += 1
            # Incremental average of the rewards obtained with policy k:
            # W[k] <- (W[k] * U[k] + R) / (U[k] + 1)
            W[k] = ((W[k] * U[k]) + R) / (U[k] + 1)
            Ws.append(W[:])  # append a copy, not a reference
            U[k] = U[k] + 1
            # TODO: verify this
            self.myQLearning.epsilon += self.myQLearning.epsilonIncrement
            self.tau = self.tau + self.deltaTau
            w_avg = ((w_avg * episode) + R) / (episode + 1)
            W_avg_list.append([w_avg])
            print 'pr: ', pr, 'ql: ', ql; sys.stdout.flush()
            PRvsQLs.append([pr, ql])
        f = open(self.filePath + '/log.txt', 'w')
        wr = csv.writer(f, quoting = csv.QUOTE_NONNUMERIC)
        wr.writerows(logStuff)
        f.close()
        return output
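    # The W[k] update in execute() is the standard incremental mean: after
    # the (U[k] + 1)-th episode run with policy k yields reward R,
    #     W[k] <- (W[k] * U[k] + R) / (U[k] + 1)
    # Worked example (illustrative numbers): W[k] = 0.5 with U[k] = 2 and a
    # new R = 1.3 gives W[k] = (0.5 * 2 + 1.3) / 3 = 0.766...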
    def pi_reuse(self, Pi_past, K, H, psi, v, logStuff, Q_pi_new = None):
        """pi-reuse exploration strategy: at each step, follow the past
        policy Pi_past with probability psi, otherwise act epsilon-greedily
        on the new Q table; psi decays by a factor of v after every step."""
        Q_pi_new = self.initializeQ_pi_new(Q_pi_new)
        randomNumber1 = -1
        randomNumber2 = -1
        epsilon = -1
        W = 0.0
        for k in range(K):
            # Set the initial state, s, randomly
            self.Agent.setInitialState()
            psi = self.psi
            for h in range(1, H + 1):
                s = self.Agent.state
                logStuffLine = []
                logStuffLine.append(len(logStuff) + 1)
                logStuffLine.append(s)
                # the episode ends when a goal state is reached
                if s in self.MDP.G:
                    break
                randomNumber1 = random()
                if randomNumber1 <= psi:
                    # With probability psi, use the policy from the library
                    # (a = Pi_past(s))
                    a = self.Agent.selectBestAction(s, source = 'Probabilistic Policy',
                                                    Pi = Pi_past)
                else:
                    # With probability (1 - psi), a = epsilon_greedy(PI_new(s))
                    randomNumber2 = random()
                    epsilon = 1 - psi
                    if randomNumber2 <= epsilon:
                        # greedy
                        a = self.Agent.selectBestAction(s, source = 'Q-Table',
                                                        Q = Q_pi_new)
                    else:
                        # random
                        a = self.Agent.selectRandomAction()
                # Execute the action; receive the next state s' and the
                # reward r_{k, h}
                s2, r = self.Agent.executeAction(a)
                # TODO: keep a vector V with the per-state maxima
                maxValue = max(Q_pi_new[s2][a2] for a2 in self.MDP.A)
                # Update Q_pi_new(s, a), and therefore PI_new
                Q_pi_new[s][a] = ((1.0 - self.alpha) * Q_pi_new[s][a]) + \
                                 self.alpha * (r + self.gamma * maxValue)
                # Set psi_(h + 1) = psi_h * v
                psi = psi * v
                # accumulate the discounted reward on W
                W = W + pow(self.gamma, h) * r
                # Set s = s'
                self.Agent.state = s2
                # log this step
                logStuffLine.append(psi)
                logStuffLine.append(float(randomNumber1))
                logStuffLine.append(float(randomNumber2))
                logStuffLine.append(epsilon)
                logStuffLine.append(a)
                logStuffLine.append(r)
                logStuffLine.append(W)
                logStuff.append(logStuffLine)
        # Average the accumulated gain over the K episodes
        W = W / float(K)
        return W
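    # Within a pi-reuse episode the reuse probability decays geometrically,
    #     psi_h = psi_1 * v**(h - 1),
    # so with psi_1 = 1.0 and v = 0.95 (illustrative values) psi_10 ~ 0.63
    # and psi_100 ~ 0.006: early steps mostly follow the past policy, later
    # steps mostly follow the epsilon-greedy new policy.  The value returned
    # is the discounted return averaged over the K episodes,
    #     W = (1 / K) * sum_k sum_h gamma**h * r_{k, h}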
    def assignProbsToPolicies(self, W, tau):
        """Boltzmann (softmax) selection probabilities over the policy
        library:  P(k) = exp(tau * W[k]) / sum_j exp(tau * W[j])."""
        denominator = 0.0
        for w in W:
            denominator += exp(tau * w)
        P = [exp(tau * w) / denominator for w in W]
        return P
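    # Worked example (illustrative numbers): W = [0.0, 1.0] and tau = 1.0
    # give P = [e**0, e**1] / (e**0 + e**1) ~ [0.269, 0.731].  With tau = 0
    # the choice is uniform; as tau grows by deltaTau each episode, the
    # distribution concentrates on the best-performing policy.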
    def choosePolicy(self, P):
        """Sample a policy index k from the probability vector P
        (roulette-wheel selection over the cumulative distribution)."""
        P = cumsum(P)
        randomNumber = random()
        k = 0
        for i in range(len(P)):
            # strictly less than, since random.random() draws from [0, 1)
            if randomNumber < P[i]:
                k = i
                break
        return k
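    # Worked example (illustrative numbers): P = [0.2, 0.3, 0.5] gives
    # cumsum(P) = [0.2, 0.5, 1.0]; a draw of randomNumber = 0.6 falls in
    # [0.5, 1.0), so k = 2 is chosen.  Each index i is selected with
    # probability P[i].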