setwd("E:/statistical genetics/homework")
#the Poplar data
data1<-read.csv("hw3 poplar_marker.csv",sep = ",",header=T)
data2<-t(data1[which(as.character(data1[,3])=="D10"),-c(1:4)])
data3<-read.csv("hw3-poplar_diameter.csv",sep=",",header=T)
data_old<-cbind(data2,data3[,12])
data0<-sort(data1[which(as.character(data1[,3])=="D10"),2])
data<-data_old[-which(data_old[,9]<0),]
M<-data[,1:8];Y<-data[,9]
nm<-dim(M);n<-length(Y)
#----------Class1=11, Class2=10,Class3=01,Class4=00
cl<-array(1,c(nm[1],(nm[2]-1)));nc<-dim(cl)
logL0<-array(0,c(1,nc[2]))
for(i in 1:nc[1]){
for(j in 1:nc[2]){
c1<-1*(M[i,j]==1)*(M[i,(j+1)]==1);
c2<-2*(M[i,j]==1)*(M[i,(j+1)]==2);
c3<-3*(M[i,j]==2)*(M[i,(j+1)]==1);
c4<-4*(M[i,j]==2)*(M[i,(j+1)]==2);
cl[i,j]<-c1+c2+c3+c4
}
}
for(n in 1:nc[2]){
al<-cbind(cl,Y);bl<-al[which(al[,n]!=0),]
mY<-mean(bl[,8]);sY<-sd(bl[,8])
L0<-sum(dnorm(bl[,8],mY,sY,log=T));logL0[1,n]<-L0
}
#----------Define Functions for P1 and P2#QTL基因型的条件概率
P1<-function(y,u1,u2,w,v){
temp1<-(1-v)*dnorm(y,mean=u1,sd=w)
temp2<-v*dnorm(y,mean=u2,sd=w)
return(temp1/(temp1+temp2))
}
P2<-function(y,u1,u2,w,v){
temp1<-v*dnorm(y,mean=u1,sd=w)
temp2<-(1-v)*dnorm(y,mean=u2,sd=w)
return(temp1/(temp1+temp2))
}
#-------------Define log likelihood---------------
logL<-function(y1,y2,y3,y4,u1,u2,w,v){
S1<-sum(dnorm(y1,u1,w,log=T))
S2<-sum((1-v)*dnorm(y2,u1,w,log=T)+v*dnorm(y2,u2,w,log=T))
S3<-sum(v*dnorm(y3,u1,w,log=T)+(1-v)*dnorm(y3,u2,w,log=T))
S4<-sum(dnorm(y4,u2,w,log=T))
#v<-(exp(2*dx)-exp(-2*dx))*(exp(2*d)+exp(-2*d))/((exp(2*dx)+exp(-2*dx))*(exp(2*d)-exp(-2*d)))
return(S1+S2+S3+S4)
}
#--------------Define MLE Solver--------------#用后验概率(EM算法)估计方程参数
MLE<-function(y1,y2,y3,y4,u1,u2,w,v){ #u1 u2分别为QTL基因型均值,v为QTL位置,w为方差
n1<-length(y1);n2<-n1+length(y2)
n3<-n2+length(y3);n<-n3+length(y4)
w1<-P1(y2,u1,u2,w,v);w2<-P2(y3,u1,u2,w,v)
u1<-(sum(y1)+sum(w1*y2)+sum((1-w2)*y3))/(n1+sum(w1)+sum(1-w2))
u2<-(sum((1-w1)*y2)+sum(w2*y3)+sum(y4))/(sum((1-w1))+sum(w2)+n-n3)
s1<-sum((y1-u1)^2)
s2<-sum(w1*(y2-u1)^2+(1-w1)*(y2-u2)^2)
s3<-sum(w2*(y3-u1)^2+(1-w2)*(y3-u2)^2)
s4<-sum((y4-u2)^2)
w<-sqrt((s1+s2+s3+s4)/n)
#v<-(exp(2*dx)-exp(-2*dx))*(exp(2*d)+exp(-2*d))/((exp(2*dx)+exp(-2*dx))*(exp(2*d)-exp(-2*d)))
#v<-(1-exp(-2*dx))/(1-exp(-2*d))
#v<-(sum(1-w1)+sum(w2))/(n3-n1)
return(c(u1,u2,w,v))
}#---This gets permutation distribution of MLEs for the Poplar data
dmax<-data0[1+1]-data0[1]
ML<-array(0,c(nc[2],4*dmax*10));LL<-array(0,c(nc[2],dmax*10))
#---------Define the Y classes for each Interval-----------
for(j in 1:nc[2]){
y1<-(cl[,j]==1)*Y;y1<-y1[y1>0]#第j列区间类型为1的行对应的表型
y2<-(cl[,j]==2)*Y;y2<-y2[y2>0]
y3<-(cl[,j]==3)*Y;y3<-y3[y3>0]
y4<-(cl[,j]==4)*Y;y4<-y4[y4>0]
dx<-(seq(data0[j],data0[j+1],by=0.1)-data0[j])[-1]
d<-data0[j+1]-data0[j]
v<-(exp(2*dx)-exp(-2*dx))*(exp(2*d)+exp(-2*d))/((exp(2*dx)+exp(-2*dx))*(exp(2*d)-exp(-2*d)))
for(k in 1:length(v)){
T<-c(mean(y1),mean(y4),sY,v[k])#设定EM算法初始值
T<-c(MLE(y1,y2,y3,y4,T[1],T[2],T[3],T[4]))
ML[j,(4*k-3):(4*k)]<-T
LL[j,k]<-2*(logL(y1,y2,y3,y4,T[1],T[2],T[3],T[4])-logL0[j])
}
}