% loopStep=3000; % number of policy-improvement steps
% learningStep=10; % number of Q-factor learning steps per improvement
% discountForAverage=1;
% firstEpsilon=0.3;
% secondEpsilon=0.1;
% averageQstep=0.9; % 0.9*0.7 is good
% QfactorStep=0.7;
% % firstEpsilon=0.8; % for the epsilon-greedy policy; epsilon is decreasing
% % secondEpsilon=0.5;
% % epsilonRate=-log(secondEpsilon/firstEpsilon)/loopStep;
% stopEpsilon=0.1; % for another stopping criterion
% maxEqualNumber=loopStep*1;
% equalNumber=0;
% deltaLook=0.05; % spacing between two neighbouring actions
clear
global M alpha uniParameter
global arriveRate erlangRate erlangOrder
% global arriveTime lossNumber
global I e
global k1 k2 k3 k4 k5
format long;
tic;
% Assume the arrival flow is a Poisson process, and the service (processing) time follows an Erlang distribution of order erlangOrder with service rate erlangRate in each phase
arriveRate=1; % arrival rate of the Poisson arrival flow
erlangOrder=4; % the order of the Erlang distribution
erlangRate=3*2/1.5; % service rate of each phase of the Erlang distribution
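% An Erlang(erlangOrder,erlangRate) service time is the sum of erlangOrder i.i.d.
% exponential phases; a minimal sampling sketch (assuming Transition below draws
% service times by an equivalent mechanism) would be:
% serviceSample=sum(-log(rand(erlangOrder,1))/erlangRate); % inverse-transform sample of each exponential phase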
% serviceRate=erlangRate/erlangOrder; % average service rate
N=5; % the capacity of the reserve
M=N+1; % number of states
maxLook=1; % the maximum lookahead time
minLook=0; % the minimum lookahead time
k1=0.1*1; % reserve cost per unit time per usable unit
k2=0.5*10; % service cost per unit time
k3=1/1; % waiting cost per unit time
k4=-10; % reward (negative cost) per processed product
k5=0.2*1; % lookahead cost per unit time
I=eye(M,M);
e=ones(M,1);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%alpha=0.01; % discount factor
alpha=0;
deltaLook=0.1; % spacing between two neighbouring actions
loopStep=1000; % number of policy-improvement steps
learningStep=50; % number of Q-factor learning steps per improvement
discountForAverage=1;
firstEpsilon=0.2;
secondEpsilon=0.1;
averageQstep=0.6; % 0.9*0.6 performed very well with firstEpsilon=0.2 (better than 0.25), and better than 0.8*0.6, 0.9*0.5, 0.9*0.7, and 1*0.6; note these comparisons are stochastic
QfactorStep=0.7;
firstEpsilon=0.5; % for the epsilon-greedy policy; epsilon decreases during the run
secondEpsilon=0.2; % the epsilon value targeted at outStep=loopStep/2
epsilonRate=-2*log(secondEpsilon/firstEpsilon)/loopStep;
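% A worked reading of this schedule (my derivation, not in the original):
% epsilon(outStep)=firstEpsilon*exp(-epsilonRate*outStep) hits secondEpsilon exactly
% at outStep=loopStep/2; the extra factor of 2 used in the loop below,
% epsilon=firstEpsilon*exp(-2*epsilonRate*outStep), reaches secondEpsilon
% already at outStep=loopStep/4 and keeps decaying afterwards.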
stopEpsilon=0.01; % for another stopping criterion
maxEqualNumber=loopStep*1;
equalNumber=0;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
x=minLook; % begin to define the discrete action set
actionSet=0; % actionSet(1)=0 is reserved as the action of state 1; possibly minLook~=0
while x<=maxLook+deltaLook/2 % the tolerance guards against floating-point drift of x
actionSet=[actionSet,x];
x=x+deltaLook;
end
% actionSet=[actionSet,maxLook];
actionSet=[actionSet,inf]; % append the action of state M: the reserve is completely free, so we must wait until a unit arrives
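% With the defaults (minLook=0, deltaLook=0.1, maxLook=1) this gives, by my reading,
% actionSet=[0 0 0.1 0.2 ... 0.9 1 Inf] and actionNumber=13; the duplicated leading 0
% is intentional, because column 1 of the Q-factor table below belongs to state 1 alone.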
actionNumber=length(actionSet);
currentState=ceil(rand(1,1)*M); % initialize state
greedyPolicy(1)=0; % initialize policy
greedyPolicy(M)=inf;
if currentState==1
actionIndex=1; % indicate the initial action
elseif currentState==M
actionIndex=actionNumber;
end
%for i=2:M-1
% j=ceil(rand(1,1)*(actionNumber-2))+1;
% greedyPolicy(i)=actionSet(j);
% if currentState==i actionIndex=j; end % in order to record the action being used
%end
greedyPolicy(2:end-1)=ones(1,M-2)*maxLook; %(maxLook-minLook)/2;
if currentState~=1&&currentState~=M actionIndex=actionNumber-1; end % initialize the action index for the middle states
pi=[0,0.602093902620241,0.753377944704191,0.893866808528892,0.999933893038648,Inf]; % a precomputed reference policy (note: this assignment shadows MATLAB's built-in pi)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Qfactor=zeros(M,actionNumber); % initialize the Q-factors to zero
% for state 1, the corresponding value is only (1,1), for state M, (M,actionNumber)
Qfactor(1,:)=[Qfactor(1,1),inf*ones(1,actionNumber-1)];
Qfactor(M,:)=[inf*ones(1,actionNumber-1),Qfactor(M,actionNumber)];
Qfactor(:,1)=[Qfactor(1,1);inf*ones(M-1,1)];
Qfactor(:,actionNumber)=[inf*ones(M-1,1);Qfactor(M,actionNumber)];
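% Layout of the Q-table (my summary): rows are states, columns are actions;
% Inf marks infeasible state-action pairs, so state 1 only uses column 1,
% state M only uses column actionNumber, and the middle states use columns 2..actionNumber-1.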
visitTimes=zeros(size(Qfactor)); % visit counts of the state-action pairs
[falpha,Aalpha,delayTime]=equivMarkov(greedyPolicy); % uniParameter is also returned (as a global); delayTime is the average delay of each state transition
[stableProb,potential]=stablePotential(falpha,Aalpha);
lastValue=falpha+Aalpha*potential; % value used by the stopping criterion
disValue=[lastValue];
averageVector=stableProb*falpha; % to store the average cost for every learningStep
AverageEsmate=averageVector;
% arriveTime=0;
% totalCost=0; % the total cost accumulated in our simulation
% totalTime=0; % the total past time in simulation
% averageQcost=0; % the estimate of the average cost
eachTransitCost=0;
eachTransitTime=0;
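% Note on the estimators initialized above (my summary of the commented lines in the loop):
% the SMDP average cost is estimated by the renewal-reward ratio
% averageQcost=eachTransitCost/eachTransitTime, i.e. the running mean cost per
% transition divided by the running mean sojourn time per transition.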
for outStep=1:loopStep
if outStep<loopStep*0.8
% epsilon=firstEpsilon; % constant-epsilon alternative
epsilon=firstEpsilon*exp(-2*epsilonRate*outStep); % decreasing as training proceeds
elseif outStep<loopStep
epsilon=secondEpsilon;
else epsilon=0;
end % the decreasing part of the schedule is an inverse exponential function
for innerStep=1:learningStep
visitTimes(currentState,actionIndex)=visitTimes(currentState,actionIndex)+1; % update the visit count of this state-action pair; it determines the stepsize for Q-learning
currentAction=greedyPolicy(currentState);
if currentState==M
lookCost=0; % indexM=0; % the cost for taking the action of looking ahead
else
lookCost=k5*currentAction; % indexM=1; % lookCost=k5*currentAction*indexM;
end
[flag,sojournTime,serveTime,nextState] = Transition(currentState,currentAction); % the key simulation step (recently revised)
% totalTime=discountForAverage*totalTime+sojournTime; % the average-cost computation below is equivalent when discountForAverage=1
eachTransitTime=discountForAverage*eachTransitTime+(sojournTime-eachTransitTime)/((outStep-1)*learningStep+innerStep)^averageQstep;
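% The update above is a stochastic-approximation average with stepsize
% 1/n^averageQstep, where n is the global transition count; an exponent in
% (0.5,1] keeps the usual conditions sum(a_n)=Inf and sum(a_n^2)<Inf satisfied.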
endSojournTime=exp(-alpha*sojournTime);
endServeTime=exp(-alpha*serveTime);
if alpha==0
discoutTime=sojournTime;
disServTime=serveTime;
disWaitTime=sojournTime-serveTime;
else
discoutTime=(1-endSojournTime)/alpha;
disServTime=(1-endServeTime)/alpha;
disWaitTime=(endServeTime-endSojournTime)/alpha;
end
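% These are the discounted time integrals: int_0^T exp(-alpha*t)dt = (1-exp(-alpha*T))/alpha,
% which reduces to T as alpha -> 0 (the alpha==0 branch above); disWaitTime integrates
% over [serveTime, sojournTime], giving (exp(-alpha*serveTime)-exp(-alpha*sojournTime))/alpha.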
if flag==0 % which means waiting
costReal=(k1*(M-currentState)+k3)*sojournTime+lookCost; % reserve and waiting costs over the sojourn; the lookahead cost is incurred immediately in the current state
% totalCost=discountForAverage*totalCost+costReal;
% averageQcost=totalCost/totalTime;
% eachTransitCost=discountForAverage*eachTransitCost+(costReal-eachTransitCost)/((outStep-1)*learningStep+innerStep)^averageQstep;
% averageQcost=eachTransitCost/eachTransitTime;
% costDiscouted=(k1*(M-currentState)+k3-averageQcost)*discoutTime+lookCost;
purtCost=(k1*(M-currentState)+k3)*discoutTime+lookCost; % the discounted cost of this transition (waiting case)
else % which means serving
costReal=k1*(M-currentState-1)*sojournTime+k2*serveTime+k3*(sojournTime-serveTime)+k4+lookCost; % reserve, service, and waiting costs plus the reward k4; the lookahead cost is incurred immediately in the current state
% totalCost=discountForAverage*totalCost+costReal;
% averageQcost=totalCost/totalTime;
% eachTransitCost=discountForAverage*eachTransitCost+(costReal-eachTransitCost)/((outStep-1)*learningStep+innerStep)^averageQstep;
% averageQcost=eachTransitCost/eachTransitTime;
% costDiscouted=(k1*(M-currentState-1)-averageQcost)*discoutTime+k2*disServTime+k3*disWaitTime+k4*endSojournTime+lookCost;
purtCost=k1*(M-currentState-1)*discoutTime+k2*disServTime+k3*disWaitTime+k4*endSojournTime+lookCost; % the discounted cost of this transition (serving case)