#########################################################
#                                                       #
#       R code for Resampling based Clustering          #
#                                                       #
#                  Han Li                               #
#          Last update: Sep 18, 2014                    #
#                                                       #
#########################################################


#command-line arguments (positional, all six are required):
#  1 dataName : prefix of the output files
#  2 fileName : path of the input data file
#  3 R        : number of replicates
#  4 BurnIn   : number of burn-in iterations
#  5 ITER     : total number of iterations
#  6 flag     : 1 = normalize the data (mean=0,sd=1); 0 = no normalization
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 6) {
    stop("expected 6 arguments: dataName fileName R BurnIn ITER flag",
         call. = FALSE)
}
dataName <- args[1]
fileName <- args[2]
R <- as.numeric(args[3])
BurnIn <- as.numeric(args[4])
ITER <- as.numeric(args[5])
flag <- as.numeric(args[6])

#fail here, with a clear message, rather than deep inside the sampler
if (anyNA(c(R, BurnIn, ITER, flag))) {
    stop("R, BurnIn, ITER and flag must all be numeric", call. = FALSE)
}



library(MCMCpack)
library(mcclust)


#input replicated measurement y
#the table has one row per gene and J*R columns stored block by block:
#columns (j-1)*R+1 ... j*R hold the R replicates of measurement j


input <- as.matrix(read.table(fileName))  #input the data
N <- nrow(input)       #number of genes
if (ncol(input) %% R != 0) {
    stop("number of input columns (", ncol(input),
         ") is not a multiple of the number of replicates (", R, ")",
         call. = FALSE)
}
J <- ncol(input) / R   #number of measurements per replicate (columns / R)
K <- N / 50            #number of true clusters for synthetic data (50 genes each)
clusterLabel <- rep(seq(1, K), each = 50) #true clustering for synthetic data

#y[i,j,r] = value of gene i, measurement j, replicate r
y <- array(0, dim = c(N, J, R))

for (r in seq_len(R)) {
    repCols <- seq(r, ncol(input), by = R)  #columns r, r+R, r+2R, ...
    y[, , r] <- input[, repCols]
}


#mean profile yMean: yMean[i,j] is the average of row i of the input over
#the R consecutive columns (j-1)*R+1 ... j*R

yMean <- matrix(0, N, J)
for (j in seq_len(J)) {
    blockCols <- seq((j - 1) * R + 1, length.out = R)
    yMean[, j] <- apply(input[, blockCols, drop = FALSE], 1, mean)
}
     
     
#normalize the data: each gene is centred and scaled with the mean and sd of
#its own mean profile, and the same shift/scale is applied to every replicate

if (flag == 1) {

    tempMean <- apply(yMean, 1, mean)   #per-gene centre
    tempSd <- apply(yMean, 1, sd)       #per-gene scale

    #a length-N vector recycles along the first (gene) dimension, so row i of
    #yMean and every y[i,,] slice are transformed with tempMean[i], tempSd[i]
    yMean <- (yMean - tempMean) / tempSd
    y <- (y - tempMean) / tempSd

}
     

#prior specification

maxK <- 20                               #the maximum number of clusters
theta0 <- mean(apply(yMean, 2, mean))    #hyperparameter for component mean
kappa <- 1 / 10                          #weight of theta0 in the component-mean update
colVars <- apply(yMean, 2, var)          #variance of each column of the mean profile
eta1 <- eta2 <- median(colVars) / 2      #upper bounds: component variance (eta1), random effect variance (eta2)
alpha <- 0.5                             #added to the counts in the rdirichlet draws
lowProb <- 0.5                           #lower bound for the diagonal of Phi
MHPropSd <- sqrt(eta1) / 2               #MH proposal sd for sigma2 and tau2
MHprob <- 0.05                           #half-width of the uniform MH proposal for the diagonal of Phi

               
#iteration parameters (state of the sampler)

conClust <- repClust <- rep(0, N)      #consensus labels (conClust) and labels of the resampled data (repClust)
repClustAll <- matrix(0, ITER, N)      #trace of repClust, one row per iteration
theta <- matrix(0, maxK, J)            #component mean
sigma2 <- matrix(0, maxK, J)           #component variance
mu <- rep(0, N)                        #gene specific random effect
tau2 <- rep(eta1 / 2, maxK)            #variance of mu
compWeight <- rep(0, maxK)             #component weight
repY <- matrix(0, N, J)                #random sample replicate data
repClustSize <- rep(0, maxK)           #size of each cluster in repClust

#Phi[k,l] is used as the probability of repClust=l given conClust=k, so each
#row has to sum to one: 0.8 on the diagonal and the remaining 0.2 spread
#evenly over the other maxK-1 entries
Phi <- matrix((1 - 0.8) / (maxK - 1), maxK, maxK)
diag(Phi) <- 0.8

                             

#use Kmeans on the mean profile to find the initial clustering; clusters with
#at least two members seed theta/sigma2 with their column means/variances,
#smaller ones get random starting values

initK <- 10
KmeansObj <- kmeans(yMean, centers = initK, iter.max = 50)
initClust <- KmeansObj$cluster
for (k in seq_len(initK)) {
    inCluster <- initClust == k
    if (sum(inCluster) > 1) {
        members <- yMean[inCluster, ]
        theta[k, ] <- apply(members, 2, mean)
        sigma2[k, ] <- apply(members, 2, var)
    } else {
        theta[k, ] <- rnorm(J, mean = theta0, sd = sqrt(eta1))
        sigma2[k, ] <- runif(J, 0, eta1)
    }
}



#components not seeded by k-means start from random values: normal around
#theta0 for the means, uniform on (0, eta1) for the variances
unseeded <- (initK + 1):maxK
for (k in unseeded) {
    theta[k, ] <- rnorm(J, mean = theta0, sd = sqrt(eta1))
    sigma2[k, ] <- runif(J, 0, eta1)
}


#initial component weights: smoothed k-means cluster proportions
clustSize <- rep(0, maxK)
clustSize[seq_len(initK)] <- tabulate(initClust, nbins = initK)
compWeight <- (clustSize + 0.5) / (N + 0.5 * maxK)

#initial repClust: keep the k-means label with probability 0.8, otherwise
#pick one of the initK labels uniformly at random
for (i in seq_len(N)) {
    repClust[i] <- if (runif(1) < 0.8) initClust[i] else sample(initK, 1)
}


         

#main sampler loop; iterations are numbered from 2, so row 1 of repClustAll
#is never written
for (iter in 2:ITER) {


    #sample replicates: for every gene i and measurement j pick one of the R
    #replicate values uniformly at random

    for (i in 1:N) {
        for (j in 1:J) {
            index <- sample(R, 1)
            repY[i, j] <- y[i, j, index]
        }
    }


    #sample nu (conClust): P(conClust[i]=k) is proportional to
    #compWeight[k]*Phi[k,repClust[i]]; computed on the log scale and shifted
    #by the maximum before exponentiating for numerical stability

    logCompWeight <- log(compWeight)

    for (i in 1:N) {

        temp <- logCompWeight + log(Phi[, repClust[i]])
        temp <- temp - max(temp)
        temp <- exp(temp)
        temp <- temp / sum(temp)
        conClust[i] <- sample(maxK, 1, prob = temp)
    }



    #sample c (repClust): Gaussian log-likelihood of the resampled profile
    #under each component plus log Phi[conClust[i],k]


    for (i in 1:N) {
        crit <- rep(0, maxK)
        for (k in 1:maxK) {
            temp1 <- (repY[i, ] - theta[k, ] - mu[i])^2 / sigma2[k, ]
            temp1 <- -temp1 / 2 - log(sigma2[k, ]) / 2
            crit[k] <- sum(temp1)
        }

        crit <- crit + log(Phi[conClust[i], ])
        crit <- crit - max(crit)
        crit <- exp(crit)
        crit <- crit / sum(crit)
        repClust[i] <- sample(maxK, 1, prob = crit)
    }



    #clusters with at least two members get their parameters updated from
    #the data; the others are refreshed at random further below

    for (k in 1:maxK) {
        repClustSize[k] <- sum(repClust == k)
    }
    validK <- seq(1, maxK)[repClustSize > 1]
    for (k in validK) {

        nk <- repClustSize[k]
        inK <- repClust == k

        #sample theta: mean vector; sigma2: variance of theta

        #rows are the members of cluster k with their random effect removed
        #(mu[inK] recycles down the rows)
        tempmat <- repY[inK, ] - mu[inK]
        tempMean <- apply(tempmat, 2, mean)
        for (j in 1:J) {
            #weighted average of theta0 (weight kappa) and the cluster mean
            #(weight nk); the whole numerator is divided by kappa+nk
            postMean <- (theta0 * kappa + nk * tempMean[j]) / (kappa + nk)
            postVar <- sigma2[k, j] / (nk + kappa)
            theta[k, j] <- rnorm(1, mean = postMean, sd = sqrt(postVar))

            #random-walk MH step for sigma2[k,j]; proposals are reflected at
            #0 and at eta1 and floored at 0.0001
            propSigma2 <- rnorm(1, mean = sigma2[k, j], sd = MHPropSd)
            if (propSigma2 < 0) {
                propSigma2 <- abs(propSigma2)
            }
            if (propSigma2 > eta1) {
                propSigma2 <- max(2 * eta1 - propSigma2, 0.0001)
            }
            sumSq <- sum((tempmat[, j] - theta[k, j])^2)
            crit <- -nk / 2 * (log(propSigma2) - log(sigma2[k, j])) -
                (sumSq / propSigma2 - sumSq / sigma2[k, j]) / 2
            if (log(runif(1)) < crit) {
                sigma2[k, j] <- propSigma2
            }
        }

        #sample mu: gene specific random effect

        tempIndex <- seq(1, N)[inK]
        dataPrec <- 1 / sigma2[k, ]
        dataPrecSum <- sum(dataPrec)
        for (i in tempIndex) {
            resid <- repY[i, ] - theta[k, ]
            #NOTE(review): the mean is the precision-weighted residual average
            #without the 1/tau2[k] term that appears in the variance; kept as
            #in the original - confirm this is intended
            muMean <- sum(resid * dataPrec) / dataPrecSum
            #use this cluster's own tau2[k]; passing the whole tau2 vector
            #would make rnorm silently use the value derived from tau2[1]
            muVar <- 1 / (dataPrecSum + 1 / tau2[k])
            mu[i] <- rnorm(1, mean = muMean, sd = sqrt(muVar))
        }


        #add the identifiability constraint that the mean of mu is 0 within
        #the cluster; the removed offset is moved into theta

        temp <- mean(mu[inK])
        mu[inK] <- mu[inK] - temp
        theta[k, ] <- theta[k, ] + temp


        #sample tau2: variance of mu (random-walk MH, reflected at 0 and at
        #eta2 and floored at 0.0001)


        propTau2 <- rnorm(1, mean = tau2[k], sd = MHPropSd)
        if (propTau2 < 0) {
            propTau2 <- abs(propTau2)
        }
        if (propTau2 > eta2) {
            propTau2 <- max(2 * eta2 - propTau2, 0.0001)
        }
        temp <- sum(mu[inK]^2)
        crit <- -nk / 2 * (log(propTau2) - log(tau2[k])) -
            temp / 2 * (1 / propTau2 - 1 / tau2[k])
        if (log(runif(1)) < crit) {
            tau2[k] <- propTau2
        }

    }



    #empty and singleton clusters: draw fresh theta, sigma2 and tau2, and for
    #a singleton also redraw the random effect of its only member

    invalidK <- seq(1, maxK)[repClustSize <= 1]
    for (k in invalidK) {
        theta[k, ] <- rnorm(J, mean = theta0, sd = sqrt(eta1))
        sigma2[k, ] <- runif(J, 0, eta1)
        tau2[k] <- runif(1, 0, eta2)
        if (repClustSize[k] == 1) {
            i <- seq(1, N)[repClust == k]
            mu[i] <- rnorm(1, mean(repY[i, ] - theta[k, ]), sqrt(tau2[k]))
        }
    }


    #sample compweight: Dirichlet draw with parameters alpha + consensus
    #cluster sizes

    for (k in 1:maxK) {
        clustSize[k] <- sum(conClust == k)
    }
    compWeight <- rdirichlet(1, alpha + clustSize)


    #sample Phi
    #transTable[k,l] counts the genes with conClust=k and repClust=l

    transTable <- matrix(0, maxK, maxK)
    for (i in 1:N) {
        transTable[conClust[i], repClust[i]] <-
            transTable[conClust[i], repClust[i]] + 1
    }
    validK <- seq(1, maxK)[clustSize > 1]
    for (k in validK) {
        #uniform random-walk MH step for the diagonal, reflected into
        #[lowProb, 1]
        prop <- Phi[k, k] + runif(1, -MHprob, MHprob)
        if (prop < lowProb) {
            prop <- 2 * lowProb - prop
        }
        if (prop > 1) {
            prop <- 2 - prop
        }
        crit <- transTable[k, k] * log(prop / Phi[k, k]) +
            (clustSize[k] - transTable[k, k]) * log((1 - prop) / (1 - Phi[k, k]))
        if (log(runif(1)) < crit) {
            Phi[k, k] <- prop
        }
        #off-diagonal entries: Dirichlet draw rescaled so the row sums to 1
        temp <- rdirichlet(1, alpha + transTable[k, -k])
        Phi[k, -k] <- temp * (1 - Phi[k, k])
    }

    #consensus clusters with at most one member get a random row of Phi
    invalidK <- seq(1, maxK)[clustSize <= 1]
    for (k in invalidK) {
        Phi[k, k] <- runif(1, lowProb, 1)
        temp <- rdirichlet(1, rep(1, maxK - 1))
        Phi[k, -k] <- temp * (1 - Phi[k, k])
    }



    #record c

    repClustAll[iter, ] <- repClust


}


#########################################################

#result analysis

#########################################################


#posterior pairwise probability matrix: pairProb[i,j] is the fraction of
#retained draws in which genes i and j carry the same repClust label


#retain every two iteration after burn-in; never start before row 2 because
#the sampler loop begins at iteration 2 and row 1 of repClustAll stays all
#zero (and an index of 0 would be dropped silently)
keepIter <- seq(max(BurnIn, 2), ITER, 2)
clustRes <- repClustAll[keepIter, , drop = FALSE]  #drop=FALSE: stay a matrix even for a single draw
numSample <- nrow(clustRes)
pairProb <- matrix(0, N, N)
for (i in seq_len(N - 1)) {
    others <- (i + 1):N
    #column i recycles down the columns of the sub-matrix, so colSums counts
    #the draws in which gene i and each later gene share a label
    agree <- colSums(clustRes[, others, drop = FALSE] == clustRes[, i])
    pairProb[i, others] <- agree
    pairProb[others, i] <- agree
}


pairProb <- pairProb / numSample
diag(pairProb) <- 1


#average linkage hierarchical clustering on the dissimilarity 1-pairProb

distRes <- as.dist(1 - pairProb)
hcRes <- hclust(distRes, method = "average")

#for every retained draw count the clusters with more than one member; the
#tree is cut at the most frequent count
iterK <- rep(0, maxK)
for (it in seq_len(numSample)) {
    sizes <- tabulate(clustRes[it, ], nbins = maxK)
    tempK <- sum(sizes > 1)
    iterK[tempK] <- iterK[tempK] + 1   #a count of 0 indexes nothing and is ignored
}
modeK <- which.max(iterK)
outputClust <- cutree(hcRes, k = modeK)

#clusterLabel is built for synthetic data with 50 genes per cluster; when its
#length does not match N there is no reference clustering to compare against,
#so report NA instead of letting arandi stop before the results are written
if (length(clusterLabel) == N) {
    ARI <- arandi(outputClust, clusterLabel)  #adjusted Rand index
} else {
    ARI <- NA
}
result <- c(modeK, ARI)
print(result)


#write the cluster labels and the (modeK, ARI) summary as plain
#tab-separated text without header, row names or quoting
clustFile <- paste0(dataName, "_clust.txt")
resFile <- paste0(dataName, "_res.txt")
write.table(outputClust, clustFile, col.names = FALSE, row.names = FALSE,
            sep = "\t", quote = FALSE)
write.table(result, resFile, col.names = FALSE, row.names = FALSE,
            sep = "\t", quote = FALSE)

