#########################################################################
#   R functions to calculate the operating characteristics for          #
#   hierarchical H(n1:n2:...:nS), when biomarker distributions are      #
#   normal, corresponding to the manuscript titled,                     #
#   "Group testing case identification with biomarker information"		  #                                                 #
#                                                 Date: 08/06/2017.	    #
#########################################################################
rm(list=ls(all=TRUE))


#***********************************************************************#
# Supporting function
# Function to exactly compute the three thresholds for pool of size Gseq
# Arguments:
# n.sep:        = a sequence of pool sizes, for example, one could take
#                 n.seq=c(1,2,10). This function will produce thresholds
#                 for pools of size 1,2,10, respectively.
# Pos.para      = c(mu+, sigma+), where mu+ and sigma+ are the mean and 
#                 standard deviation of the positive biomarker distribution
#                 respectively. For example, if it is N(6,1), we take
#                 Pos.para=c(6,1).
# Neg.para      = c(mu-, sigma-), where mu- and sigma- are the mean and 
#                 standard deviation of the negative biomarker distribution
#                 respectively. For example, if it is N(3,0.25), we take
#                 Neg.para=c(3,0.5).
# Mes.para      = c(mu_epsilon, sigma_epsilon), where mu_epsilon and 
#				  sigma-epsilon are the mean and
#                 standard deviation of the measrument error distribution
#                 respectively. For example, if it is N(0,0.0025), we take
#                 Mes.para=c(0,0.05).
Exact.Rules.thres=function(n.sep,Pos.para,Neg.para,Mes.para)
{
  
  # Define some variables
  up=Pos.para[1]
  sp=Pos.para[2]
  un=Neg.para[1]
  sn=Neg.para[2]
  ue=Mes.para[1]
  se=Mes.para[2]
  
  # Calculate the optimal threshold for individual testing
  Youden=function(t0)
  {
    res=pnorm(t0,un+ue,sqrt(sn^2+se^2))+1-pnorm(t0,up+ue,sqrt(sp^2+se^2))
    return(-res)
  }
  tau.star=optimize(Youden,interval=c(un,up))$minimum
  
  # Based on the optimal threshold, compute the sensitivity and specificity of individual testing
  Sp=pnorm(tau.star,un+ue,sqrt(sn^2+se^2))
  Se=1-pnorm(tau.star,up+ue,sqrt(sp^2+se^2))
  
  # The first type of threshold for pools: fix it to be the individual optimal threshold
  ruleA=rep(tau.star,length(n.sep))
  
  # The second type of threshold for pools: divide the individual optimal threshold by the pool size
  ruleB=tau.star/n.sep
  
  # The new type of threshold we proposed in the article
  ruleC=c()
  for(n in n.sep)
  {
    if(n==1){
      ruleC=c(ruleC,tau.star)
    }else{
      pooled.Youden=function(t0)
      {
        D=rep(1,n)/n
        Tv=rep(0,n)
        SP=pnorm(t0,
                 sum( D*(up*Tv+un*(1-Tv)) )+ue,
                 sqrt(sum(sp^2*Tv+sn^2*(1-Tv))/(n^2)+se^2)
        )[1]
        Tv[1]=1
        SE=1-pnorm(t0,
                   sum( D*(up*Tv+un*(1-Tv)) )+ue,
                   sqrt(sum(sp^2*Tv+sn^2*(1-Tv))/(n^2)+se^2)
        )[1]
        return(-(SE+SP))
      }
      ruleC=c(ruleC,optimize(pooled.Youden,interval=c(un,((n-1)*un+up)/n))$minimum)}
  }
  
  #	Return all results in a list.
  # tau.star: optimal threshold for individual testing by maximing the Youden index
  # Se: sensitivity of individual testing 
  # Sp: specificity of individual testing
  # ruleA: thresholds for pools of size n.seq by fixing them to be tau.star
  # ruleB: thresholds for pools of size n.seq by dividing tau.star by n.seq
  # ruleC: our newly proposed threshold for pooled by maximizing a pooled version of Youden index
  return(list(tau.star=tau.star,Se=Se,Sp=Sp,ruleA=ruleA,ruleB=ruleB,ruleC=ruleC))
}



#***********************************************************************#
# Supporting function
# Function to exactly compute the operating characteristics of
# an S-stage hierarchial algorithm H(n1:n2:...:nS)
# when disease prevalence is p,
# but under traditional assumptions that were assumed in Kim et al. (2007).
# This funcion is coded based on Kim et al. (2007)
#
# Arguments:
# S.size:       = c(n1, n2, ..., nS). The configuration of the algorithm
# p             = disease prevalence
# Se            = sensitivity of individual testing
# Sp            = specificity of individual testing

Exact.Tradition.Hierarchical.homo=function(S.size, p, Se, Sp)
{
  S=length(S.size)
  q=1-p
  eff=1
  for(s in 1:(S-1))
  {
    temp=q^(S.size[1])*(1-Sp)^s+(1-q^(S.size[s]))*Se^s
    if(s>1)
    {
      for(j in 1:(s-1))
      {
        temp=temp+(q^(S.size[j+1])-q^(S.size[j]))*Se^j*(1-Sp)^(s-j)
      }
    }
    eff=eff+S.size[1]/S.size[s+1]*temp
  }
  eff=eff/S.size[1]
  
  psp=q^(S.size[1]-1)*(1-Sp)^S
  for(s in 2:S)
  {
    psp=psp+(q^(S.size[s]-1)-q^(S.size[s-1]-1))*Se^(s-1)*(1-Sp)^(S-s+1)
  }
  psp=1-psp
  pse=Se^S
  ppv=pse*p/(pse*p+(1-psp)*(1-p))
  npv=psp*(1-p)/(psp*(1-p)+(1-pse)*p)
  
  # return everything in a list
  # EFF: testing efficiency
  # PSE: pooling sensitivity
  # PSP: pooling specificity
  # PPV: pooling positive predictive value
  # NPV: pooling negative predictive value  
  return(list(EFF=eff,PSE=pse,PSP=psp,PPV=ppv,NPV=npv))
}


#***********************************************************************#
# Main function for normal biomarkers
# Function to exactly compute the operating characteristics of
# an S-stage hierarchial algorithm H(n1:n2:...:nS)
# when disease prevalence is p and
# biomarker distributions are normal.
#
# Arguments:
# S.size:       = c(n1, n2, ..., nS). The configuration of the algorithm
# p             = disease prevalence
# THRED         = a sequence of thresholds used for pools of sizes n1, n2, 
#                 n3, ..., nS, respectively. Note that, one should make
#                 sure that the order of thresholds in THRED is the same
#                 as the one of pool sizes in S.size.
# Pos.para      = c(mu+, sigma+), where mu+ and sigma+ are the mean and 
#                 standard deviation of the positive biomarker distribution
#                 respectively. For example, if it is N(6,1), we take
#                 Pos.para=c(6,1).
# Neg.para      = c(mu-, sigma-), where mu- and sigma- are the mean and 
#                 standard deviation of the negative biomarker distribution
#                 respectively. For example, if it is N(3,0.25), we take
#                 Neg.para=c(3,0.5).
# Mes.para      = c(mu0, sigma0), where mu0 and sigma0 are the mean and
#                 standard deviation of the measrument error distribution
#                 respectively. For example, if it is N(0,0.0025), we take
#                 Mes.para=c(0,0.05).

Exact.Hierarchical=function(S.size, p, THRED, Pos.para, Neg.para, Mes.para)
{
  require(mvtnorm)
  
  # define some variables
  up=Pos.para[1]
  sp=Pos.para[2]
  un=Neg.para[1]
  sn=Neg.para[2]
  ue=Mes.para[1]
  se=Mes.para[2]
  Stage=length(S.size)

  ########################################
  # Supporting function
  General.space=function(temp.size)
  {
    if(length(temp.size)==1)
    {
      res=matrix(0:temp.size)	
    }
    if(length(temp.size)>1)
    {
      temp.s=temp.size[-length(temp.size)]-temp.size[-1]
      temp.s=temp.s[length(temp.s):1]
      temp.s=c(temp.size[length(temp.size)],temp.s)
      res=matrix(0,prod(temp.s+1),length(temp.s))
      temp.ss=c(0,cumsum(temp.s))
      
      for(ss in 1:(length(temp.s)-1))
      {
        res[,ss]=rep(0:(temp.s[ss]), rep(prod(temp.s[-(1:ss)]+1),temp.s[ss]+1) )
      }
      res[,length(temp.s)]=0:(temp.s[length(temp.s)])
    }
    return(res)
  }
  
  #########################################
  # Supporting function
  joint.sesp=function(tempT,ss,temp.S,THRED)
  {
    D=matrix(1,1,temp.S[1])/temp.S[1]
    if(ss>1)
    {
      for(tem.s in 2:ss)
      {
        D=rbind(D, (rep(c(1,0),c(temp.S[tem.s],temp.S[1]-temp.S[tem.s])))/temp.S[tem.s])
      }
    }
    lb=THRED[1:ss]
    ub=rep(Inf,ss)
    res=as.vector(pmvnorm(lower=lb,upper=ub,mean=as.vector(D%*%matrix(tempT*up+(1-tempT)*un+ue)), 
                          sigma=D%*%diag( tempT*sp^2+(1-tempT)*sn^2 )%*%t(D)+diag(se^2,ss))[1])
    return(res)
  }
  
  #####################################
  # Testing efficiency
  eff=1/S.size[1]
  for(ss in 1:(Stage-1))
  {
    temp.S=S.size[1:ss]
    temp.space=General.space(temp.S)
    temp.size=apply(temp.space,2,max)
    temp=0
    for(tem.m in 1:nrow(temp.space))
    {
      temp.T=rep(rep(c(0,1),ss),as.vector(matrix(c(temp.size-temp.space[tem.m,],temp.space[tem.m,]),2,,byrow=TRUE)))
      temp=temp+joint.sesp(temp.T,ss,temp.S,THRED)*prod(dbinom(temp.space[tem.m,],temp.size,p))
    }
    eff=eff+1/S.size[ss+1]*temp
  }
  
  #####################################
  # Pooling specificity
  # Pooling sensitivity
  psp=0
  pse=0
  temp.space=General.space(S.size)
  temp.size=apply(temp.space,2,max)
  temp.space=as.matrix(temp.space[-(1:(nrow(temp.space)/2)),-1])
  
  for(tem.m in 1:nrow(temp.space))
  {
    temp.TT=rep(rep(c(0,1),Stage),as.vector(matrix(c(temp.size-c(1,temp.space[tem.m,]),c(1,temp.space[tem.m,])),2,,byrow=TRUE)))
    pse=pse+joint.sesp(temp.TT,Stage,S.size,THRED)*prod(dbinom(temp.space[tem.m,],temp.size[-1],p))
    temp.TT[1]=0
    psp=psp+joint.sesp(temp.TT,Stage,S.size,THRED)*prod(dbinom(temp.space[tem.m,],temp.size[-1],p))
  }
  psp=1-psp
  ppv=pse*p/(pse*p+(1-psp)*(1-p))
  npv=psp*(1-p)/(psp*(1-p)+(1-pse)*p)
  
  # return everything in a list
  # EFF: testing efficiency
  # PSE: pooling sensitivity
  # PSP: pooling specificity
  # PPV: pooling positive predictive value
  # NPV: pooling negative predictive value 
  return(list(EFF=eff,PSE=pse,PSP=psp,PPV=ppv,NPV=npv))
}


#***********************************************************************#
# Final function for normal biomarkers
# Function to exactly compute the operating characteristics of
# an S-stage hierarchial algorithm H(n1:n2:...:nS)
# when disease prevalence is p and
# biomarker distributions are normal.
#
# Arguments:
# S.size:       = c(n1, n2, ..., nS). The configuration of the algorithm
# p             = disease prevalence
# THRED         = a sequence of thresholds used for pools of sizes n1, n2, 
#                 n3, ..., nS, respectively. Note that, one should make
#                 sure that the order of thresholds in THRED is the same
#                 as the one of pool sizes in S.size.
# Pos.para      = c(mu+, sigma+), where mu+ and sigma+ are the mean and 
#                 standard deviation of the positive biomarker distribution
#                 respectively. For example, if it is N(6,1), we take
#                 Pos.para=c(6,1).
# Neg.para      = c(mu-, sigma-), where mu- and sigma- are the mean and 
#                 standard deviation of the negative biomarker distribution
#                 respectively. For example, if it is N(3,0.25), we take
#                 Neg.para=c(3,0.5).
# Mes.para      = c(mu0, sigma0), where mu0 and sigma0 are the mean and
#                 standard deviation of the measrument error distribution
#                 respectively. For example, if it is N(0,0.0025), we take
#                 Mes.para=c(0,0.05).
#
#Ouputs:
# it provides FOUR sets operating charatersitics of H(n1:n2:...:nS)
# 1st set: consider biomarker pooling and fix threshold to be the threshold for individual testing
# 2nd set: consider biomarker pooling and use threshold to be the threshold for individual testing divided by pool size
# 3rd set: consider biomarker pooling and use our newly proposed threshold to maximize the pooled version of Youden index
# 4th set: under traditional assumption using methods provided by Kim et al. (2007)

Exact.Hierarchical.all=function(S.size, p, Pos.para, Neg.para, Mes.para)
{
	Rules=Exact.Rules.thres(S.size,Pos.para,Neg.para,Mes.para)
	print("The threshold for individual testing is")
	print(Rules$tau.star)
	Se=Rules$Se
	Sp=Rules$Sp
	print("The sensitivity for individual testing is")
	print(Se)
	print("The specificity for individual testing is")
	print(Sp)
	
	print("OP.tradi: under traditional assumption using methods provided by Kim et al. (2007)")
	OP.tradi=Exact.Tradition.Hierarchical.homo(S.size, p, Rules$Se, Rules$Sp)
	print("OP.fixt0: consider biomarker pooling and fix threshold to be the threshold for individual testing")
	OP.fixt0=Exact.Hierarchical(S.size, p, Rules$ruleA, Pos.para,Neg.para,Mes.para)
	print("OP.t0byn: consider biomarker pooling and use threshold to be the threshold for individual testing divided by pool size")
	OP.t0byn=Exact.Hierarchical(S.size, p, Rules$ruleB, Pos.para,Neg.para,Mes.para)
	print("OP.newth: consider biomarker pooling and use our newly proposed threshold to maximize the pooled version of Youden index")
	OP.newth=Exact.Hierarchical(S.size, p, Rules$ruleC, Pos.para,Neg.para,Mes.para)
	return(list(cbind(OP.fixt0,OP.t0byn,OP.newth,OP.tradi)))

}
#########################################################################
# For illustration
# Set distributions, prevalence, and configuration of the hierarchial algorithm:

# Positive biomarker distribution N(6,1)
Pos.para=c(6,1)
# Negative biomarker distribution N(3,0.25)
Neg.para=c(3,0.5)
# Measurement error N(0, 0.0025)
Mes.para=c(0,0.05)
# Prevalence is p=0.05
p=0.05
# Algorithm is H(8:4:1)
S.size=c(8,4,1)

set.seed(100)
Exact.Hierarchical.all(S.size, p, Pos.para, Neg.para, Mes.para)

#Outputs:
#[1] "The threshold for individual testing is"
#[1] 4.114972
#[1] "The sensitivity for individual testing is"
#[1] 0.9701279
#[1] "The specificity for individual testing is"
#[1] 0.9867524
#[1] "OP.tradi: under traditional assumption using methods provided by Kim et al. (2007)"
#[1] "OP.fixt0: consider biomarker pooling and fix threshold to be the threshold for individual testing"
#[1] "OP.t0byn: consider biomarker pooling and use threshold to be the threshold for individual testing divided by pool size"
#[1] "OP.newth: consider biomarker pooling and use our newly proposed threshold to maximize the pooled version of Youden index"
#[[1]]
#    OP.fixt0   OP.t0byn  OP.newth  OP.tradi 
#EFF 0.1304224  1.375     0.4156434 0.3854637
#PSE 0.03188749 0.9701279 0.8138943 0.913034 
#PSP 0.9999088  0.9867524 0.9945616 0.9981931
#PPV 0.9484743  0.7939948 0.8873452 0.9637608
#NPV 0.9515129  0.9984092 0.9902475 0.9954355