% loopStep=3000; % number of policy-improvement steps
% learningStep=10; % number of Q-factor learning steps per improvement
% discountForAverage=1;
% firstEpsilon=0.3;
% secondEpsilon=0.1;
% averageQstep=0.9; % 0.9*0.7 is good
% QfactorStep=0.7;
% % firstEpsilon=0.8; % for the epsilon-greedy policy; epsilon is decreasing
% % secondEpsilon=0.5;
% % epsilonRate=-log(secondEpsilon/firstEpsilon)/loopStep;
% stopEpsilon=0.1; % for another stopping criterion
% maxEqualNumber=loopStep*1;
% equalNumber=0;
% deltaLook=0.05; % spacing between two neighbouring actions
clear
global M alpha uniParameter
global arriveRate erlangRate erlangOrder
% global arriveTime lossNumber
global I e
global k1 k2 k3 k4 k5
format long;
tic;
% Assume the arrival flow is a Poisson process, and the service (processing) time follows an Erlang distribution of order erlangOrder with service rate erlangRate in each phase
arriveRate=1; % arrival rate of the Poisson arrival flow
erlangOrder=4; % the order of the Erlang distribution
erlangRate=3*2/1.5; % service rate of each phase of the Erlang distribution
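% An Erlang(erlangOrder,erlangRate) service time is the sum of erlangOrder i.i.d.
% exponential phases; a minimal sampling sketch (assuming Transition below draws
% service times by an equivalent mechanism) would be:
% serviceSample=sum(-log(rand(erlangOrder,1))/erlangRate); % inverse-transform sample of each exponential phase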
% serviceRate=erlangRate/erlangOrder; % average service rate
N=5; % the capacity of the reserve
M=N+1; % number of states
maxLook=1; % the maximum lookahead time
minLook=0; % the minimum lookahead time
k1=0.1*1; % reserve cost per unit time per usable unit
k2=0.5*10; % service cost per unit time
k3=1/1; % waiting cost per unit time
k4=-10; % reward (negative cost) per processed product
k5=0.2*1; % lookahead cost per unit time
I=eye(M,M);
e=ones(M,1);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%alpha=0.01; % discount factor
alpha=0;
deltaLook=0.1; % spacing between two neighbouring actions
loopStep=1000; % number of policy-improvement steps
learningStep=50; % number of Q-factor learning steps per improvement
discountForAverage=1;
firstEpsilon=0.2;
secondEpsilon=0.1;
averageQstep=0.6; % 0.9*0.6 performed very well with firstEpsilon=0.2 (better than 0.25), and better than 0.8*0.6, 0.9*0.5, 0.9*0.7, and 1*0.6; note these comparisons are stochastic
QfactorStep=0.7;
firstEpsilon=0.5; % for the epsilon-greedy policy; epsilon decreases during the run
secondEpsilon=0.2; % the epsilon value targeted at outStep=loopStep/2
epsilonRate=-2*log(secondEpsilon/firstEpsilon)/loopStep;
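% A worked reading of this schedule (my derivation, not in the original):
% epsilon(outStep)=firstEpsilon*exp(-epsilonRate*outStep) hits secondEpsilon exactly
% at outStep=loopStep/2; the extra factor of 2 used in the loop below,
% epsilon=firstEpsilon*exp(-2*epsilonRate*outStep), reaches secondEpsilon
% already at outStep=loopStep/4 and keeps decaying afterwards.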
stopEpsilon=0.01; % for another stopping criterion
maxEqualNumber=loopStep*1;
equalNumber=0;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
x=minLook; % begin to define the discrete action set
actionSet=0; % actionSet(1)=0 is reserved as the action of state 1; possibly minLook~=0
while x<=maxLook+deltaLook/2 % the tolerance guards against floating-point drift of x
actionSet=[actionSet,x];
x=x+deltaLook;
end
% actionSet=[actionSet,maxLook];
actionSet=[actionSet,inf]; % append the action of state M: the reserve is completely free, so we must wait until a unit arrives
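% With the defaults (minLook=0, deltaLook=0.1, maxLook=1) this gives, by my reading,
% actionSet=[0 0 0.1 0.2 ... 0.9 1 Inf] and actionNumber=13; the duplicated leading 0
% is intentional, because column 1 of the Q-factor table below belongs to state 1 alone.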
actionNumber=length(actionSet);
currentState=ceil(rand(1,1)*M); % initialize state
greedyPolicy(1)=0; % initialize policy
greedyPolicy(M)=inf;
if currentState==1
actionIndex=1; % indicate the initial action
elseif currentState==M
actionIndex=actionNumber;
end
%for i=2:M-1
% j=ceil(rand(1,1)*(actionNumber-2))+1;
% greedyPolicy(i)=actionSet(j);
% if currentState==i actionIndex=j; end % in order to record the action being used
%end
greedyPolicy(2:end-1)=ones(1,M-2)*maxLook; %(maxLook-minLook)/2;
if currentState~=1&&currentState~=M actionIndex=actionNumber-1; end % initialize the action index for the middle states
pi=[0,0.602093902620241,0.753377944704191,0.893866808528892,0.999933893038648,Inf]; % a precomputed reference policy (note: this assignment shadows MATLAB's built-in pi)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Qfactor=zeros(M,actionNumber); % initialize the Q-factors to zero
% for state 1, the corresponding value is only (1,1), for state M, (M,actionNumber)
Qfactor(1,:)=[Qfactor(1,1),inf*ones(1,actionNumber-1)];
Qfactor(M,:)=[inf*ones(1,actionNumber-1),Qfactor(M,actionNumber)];
Qfactor(:,1)=[Qfactor(1,1);inf*ones(M-1,1)];
Qfactor(:,actionNumber)=[inf*ones(M-1,1);Qfactor(M,actionNumber)];
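% Layout of the Q-table (my summary): rows are states, columns are actions;
% Inf marks infeasible state-action pairs, so state 1 only uses column 1,
% state M only uses column actionNumber, and the middle states use columns 2..actionNumber-1.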
visitTimes=zeros(size(Qfactor)); % visit counts of the state-action pairs
[falpha,Aalpha,delayTime]=equivMarkov(greedyPolicy); % uniParameter is also returned (as a global); delayTime is the average delay of each state transition
[stableProb,potential]=stablePotential(falpha,Aalpha);
lastValue=falpha+Aalpha*potential; % value used by the stopping criterion
disValue=[lastValue];
averageVector=stableProb*falpha; % to store the average cost for every learningStep
AverageEsmate=averageVector;
% arriveTime=0;
% totalCost=0; % the total cost accumulated in our simulation
% totalTime=0; % the total past time in simulation
% averageQcost=0; % the estimate of the average cost
eachTransitCost=0;
eachTransitTime=0;
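% Note on the estimators initialized above (my summary of the commented lines in the loop):
% the SMDP average cost is estimated by the renewal-reward ratio
% averageQcost=eachTransitCost/eachTransitTime, i.e. the running mean cost per
% transition divided by the running mean sojourn time per transition.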
for outStep=1:loopStep
if outStep<loopStep*0.8
% epsilon=firstEpsilon; % constant-epsilon alternative
epsilon=firstEpsilon*exp(-2*epsilonRate*outStep); % decreasing as training proceeds
elseif outStep<loopStep
epsilon=secondEpsilon;
else epsilon=0;
end % the decreasing part of the schedule is an inverse exponential function
for innerStep=1:learningStep
visitTimes(currentState,actionIndex)=visitTimes(currentState,actionIndex)+1; % update the visit count of this state-action pair; it determines the stepsize for Q-learning
currentAction=greedyPolicy(currentState);
if currentState==M
lookCost=0; % indexM=0; % the cost for taking the action of looking ahead
else
lookCost=k5*currentAction; % indexM=1; % lookCost=k5*currentAction*indexM;
end
[flag,sojournTime,serveTime,nextState] = Transition(currentState,currentAction); % the key simulation step (recently revised)
% totalTime=discountForAverage*totalTime+sojournTime; % the average-cost computation below is equivalent when discountForAverage=1
eachTransitTime=discountForAverage*eachTransitTime+(sojournTime-eachTransitTime)/((outStep-1)*learningStep+innerStep)^averageQstep;
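% The update above is a stochastic-approximation average with stepsize
% 1/n^averageQstep, where n is the global transition count; an exponent in
% (0.5,1] keeps the usual conditions sum(a_n)=Inf and sum(a_n^2)<Inf satisfied.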
endSojournTime=exp(-alpha*sojournTime);
endServeTime=exp(-alpha*serveTime);
if alpha==0
discoutTime=sojournTime;
disServTime=serveTime;
disWaitTime=sojournTime-serveTime;
else
discoutTime=(1-endSojournTime)/alpha;
disServTime=(1-endServeTime)/alpha;
disWaitTime=(endServeTime-endSojournTime)/alpha;
end
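% These are the discounted time integrals: int_0^T exp(-alpha*t)dt = (1-exp(-alpha*T))/alpha,
% which reduces to T as alpha -> 0 (the alpha==0 branch above); disWaitTime integrates
% over [serveTime, sojournTime], giving (exp(-alpha*serveTime)-exp(-alpha*sojournTime))/alpha.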
if flag==0 % which means waiting
costReal=(k1*(M-currentState)+k3)*sojournTime+lookCost; % reserve and waiting costs over the sojourn; the lookahead cost is incurred immediately in the current state
% totalCost=discountForAverage*totalCost+costReal;
% averageQcost=totalCost/totalTime;
% eachTransitCost=discountForAverage*eachTransitCost+(costReal-eachTransitCost)/((outStep-1)*learningStep+innerStep)^averageQstep;
% averageQcost=eachTransitCost/eachTransitTime;
% costDiscouted=(k1*(M-currentState)+k3-averageQcost)*discoutTime+lookCost;
purtCost=(k1*(M-currentState)+k3)*discoutTime+lookCost; % the discounted cost of this transition (waiting case)
else % which means serving
costReal=k1*(M-currentState-1)*sojournTime+k2*serveTime+k3*(sojournTime-serveTime)+k4+lookCost; % reserve, service, and waiting costs plus the reward k4; the lookahead cost is incurred immediately in the current state
% totalCost=discountForAverage*totalCost+costReal;
% averageQcost=totalCost/totalTime;
% eachTransitCost=discountForAverage*eachTransitCost+(costReal-eachTransitCost)/((outStep-1)*learningStep+innerStep)^averageQstep;
% averageQcost=eachTransitCost/eachTransitTime;
% costDiscouted=(k1*(M-currentState-1)-averageQcost)*discoutTime+k2*disServTime+k3*disWaitTime+k4*endSojournTime+lookCost;
purtCost=k1*(M-currentState-1)*discoutTime+k2*disServTime+k3*disWaitTime+k4*endSojournTime+lookCost; % the discounted cost of this transition (serving case)