#########################################################################
#   R functions to compute the operating characteristics for            #
#   squared array A(k \times k) via Monte Carlo Simulation,             #
#   corresponding to the manuscript titled,                             #
#   "Group testing case identification with biomarker information"		  # 
#                                                 Date: 08/06/2017.	    #
#########################################################################
rm(list=ls(all=TRUE))

#########################################################################
# Part I: Define biomarker distribution: f+, f-, f_epsilon, separately
#
# Improtant:
# Pos.genr(B) should generate B samples from f+
# Neg.genr(B) should generate B samples from f-
# Mes.genr(c.tilde) should generate a measured value for each value of the argument "c.tilde" based on f_epsilon 
#########################################################################
# The following illustrates how to construct these functions by using the biomarker distributions
# and the measurement error distribution identified by May et al. (2010)
##############################################################
# Neg.genr(B) generates B random samples from the user-chosen negative biomarker distribution.
# The following example defines the negative biomarker distribution given in May et al. (2010)
Neg.genr=function(B)
{
  para=c( 0.85,0.05,0.1,
          0,50,
          50,100,
          100,500)
  m.n=3
  multi.prob=para[1:m.n]
  temp.a=rmultinom(B,1,multi.prob)
  temp.b=rbind(	runif(B,para[m.n+1],para[m.n+2]),
                runif(B,para[m.n+3],para[m.n+4]),
                runif(B,para[m.n+5],para[m.n+6])
  )
  # This defines the distribution of negative individual biomarker
  # levels by a mixed uniform distribution, 
  #with probability 0.85 being U(0,50),
  # 0.05 being U(50,100), 0.1 being U(100,500).
  # The output is a random sample of size n from this mixed uniform distribution
  return(colSums(temp.a*temp.b))
}

############################################################
# Pos.genr(B) generates B random samples from the user-chosen positive biomarker distribution
# The following example defines the positive biomarker distribution given in May et al. (2010)
Pos.genr=function(B)
{
  para=c(0.93,0.07,
         2.7,1.6,.5,	#location,shape,scale
         2.7,3.18,.5	#location,shape,scale
  )
  m.n=2
  multi.prob=para[1:m.n]
  temp.a=rmultinom(B,1,multi.prob)
  temp.b=rbind(	
    rgamma(B,shape=para[m.n+2],scale=para[m.n+3])+para[m.n+1],
    rgamma(B,shape=para[m.n+5],scale=para[m.n+6])+para[m.n+4]
  )
  
  # This defines the distribution of positive individual biomarker
  # levels by a mixed gamma distribution, 
  #with probability 0.93 being Gamma(2.7,1.6,0.5),
  # 0.07 being Gamma(2.7,3.18,0.5).
  # The output is a random sample of size n from this mixed gamma distribution.
  return(10^(colSums(temp.a*temp.b)))
}

############################################################
# Given a vector of true biomarker levels \tilde C's, this function produces measured values of these true levels
# based on the user-chosen measurement error distribution.
# The following example defines the measurement error distribution given in May et al. (2010)
Mes.genr=function(c.tilde)
{
  para=c(0,0.12)
  sample=log10(c.tilde)
  
  # this defines that for true level \tilde C, 
  # the measurment C given \tilde C follows
  # N(log10(\tilde C), 0.12^2)
  
  return(10^(rnorm(	length(sample),
                sample+para[1],
                para[2])
  )
  )
}

#################################################################################
# Part II: obtain threshold, sensitivity, Specificity for individual testing
# Arguments:
# Pos.genr: see Part I
# Neg.genr: see Part I
# Mes.genr: see Part I
# B: number of Monte Carlo samples
# Outputs:
# tau.star: the threshold for individual testing (from maximizing the Youden index)
# Se: the sensitivity for individual testing
# Sp: the specificity for individual testing
IndTest=function(Pos.genr,Neg.genr,Mes.gner,B)
{
  # Generate B samples from f+
  sample.pos=Mes.genr(Pos.genr(B))
  # Generate B samples from f-
  sample.neg=Mes.genr(Neg.genr(B))
  
  # Maximize the Youden index to find the optimal threshold for individual testing
  int=c(median(sample.neg),median(sample.pos))
  Youden=function(t)
  {
    res=mean(sample.pos>t)+mean(sample.neg<t)
    return(-res)
  }
  tau.star=optimize(Youden,interval=int)$minimum
  
  # Sensitivity
  Se=mean(sample.pos>tau.star)
  
  # Specificity
  Sp=mean(sample.neg<tau.star)
  
  # return everything as in a list
  return(list(tau.star=tau.star,Se=Se,Sp=Sp))
}

############################################################################## 
#Part III: obtain threshold for pooled testing
# Function to compute the newly proposed threshold for pool of size n via Monte Carlo approximation
# Arguments:
# n       : a given pool size
# Pos.genr: see Part I
# Neg.genr: see Part I
# Mes.genr: see Part I
# B: number of Monte Carlo samples
Thres=function(n,Pos.genr,Neg.genr,Mes.genr, B)
{
  sample.pos=Mes.genr(rowMeans(cbind(Pos.genr(B), matrix(Neg.genr(B*(n-1)),B,n-1))))
  sample.neg=Mes.genr(rowMeans(matrix(Neg.genr(B*n),B,n)))	
  int=c(as.vector(quantile(sample.neg,.45)),quantile(sample.pos,.55))
  pooled.Youden=function(t)
  {
    res=mean(sample.pos>t)+mean(sample.neg<t)
    return(-res)
  }
  # maximize the pooled version of Youden index
  return(optimize(pooled.Youden,interval=int)$minimum)
}

################################################################################
# Part IV: Monte Carlo simulation procedure to approximate the operating 
# characteristis of a squared array algorithm in our biomarker-based framework.

#***********************************************************************#
# Supporting function
# This function simply does one Monte Carlo simulation for efficiency
# Arguments:
# A.size:       = c(k,k). The configuration of the squared array A(k \times k)
# p	            = disease prevalence
# thres         = c(threshold for size k, threshold for size k, threshold for size 1)
# Pos.genr: see Part I
# Neg.genr: see Part I
# Mes.genr: see Part I

Sim.Array.num=function(	A.size,p,thres,Pos.genr,Neg.genr,Mes.genr)
{
  size=prod(A.size)
  n.row=A.size[1]
  n.col=A.size[2]
  Truth=rbinom(size,1,p)
  Truth.Con=Truth*Pos.genr(size) +(1-Truth)*Neg.genr(size)
  True.Status=matrix(Truth,n.row,n.col,byrow=TRUE)
  True.Concen=matrix(Truth.Con,n.row,n.col,byrow=TRUE)
  Ind=matrix(1:size,n.row,n.col,byrow=TRUE)
  
  eff=0
  row.z=I(Mes.genr(apply(True.Concen,1,mean))>thres[1])*1
  col.z=I(Mes.genr(apply(True.Concen,2,mean))>thres[2])*1
  eff=eff+sum(A.size)
  
  if(sum(row.z)==0 && sum(col.z)!=0)
  {
    eff=eff+sum(col.z)*n.row
  }
  if(sum(row.z)!=0 && sum(col.z)==0)
  {
    eff=eff+sum(row.z)*n.col
  }
  if(sum(row.z)!=0 && sum(col.z)!=0)
  {
    eff=eff+sum(row.z)*sum(col.z)
  }
  return(num=eff/size)
}

#***********************************************************************#
# Supporting function
# This function simply does one Monte Carlo simulation for pooling sensitivity and pooling specificity
# Arguments:
# A.size:       = c(k,k). The configuration of the squared array A(k \times k)
# p             = disease prevalence
# thres         = c(threshold for size k, threshold for size k, threshold for size 1)
# Pos.genr: see Part I
# Neg.genr: see Part I
# Mes.genr: see Part I


Sim.Array.sesp=function(A.size,p,thres,Pos.genr,Neg.genr,Mes.genr)
{
  size=prod(A.size)
  n.row=A.size[1]
  n.col=A.size[2]
  
  Truth=c(0,rbinom(size-1,1,p))
  Truth.Con=Truth*Pos.genr(size) +(1-Truth)*Neg.genr(size)
  True.Status=matrix(Truth,n.row,n.col,byrow=TRUE)
  True.Concen=matrix(Truth.Con,n.row,n.col,byrow=TRUE)
  
  row.z=I(Mes.genr(apply(True.Concen,1,mean))>thres[1])*1
  col.z=I(Mes.genr(apply(True.Concen,2,mean))>thres[2])*1
  
  Diag.neg=0
  if(row.z[1]==1 && sum(col.z)==0)
  {
    Diag.neg=I(Mes.genr(True.Concen[1,1])>thres[3])*1
  }
  if(sum(row.z)==0 && col.z[1]==1)
  {
    Diag.neg=I(Mes.genr(True.Concen[1,1])>thres[3])*1
  }
  if(row.z[1]==1 && col.z[1]==1)
  {
    Diag.neg=I(Mes.genr(True.Concen[1,1])>thres[3])*1
  }
  
  
  Truth=c(1,rbinom(size-1,1,p))
  Truth.Con=Truth*Pos.genr(size) +(1-Truth)*Neg.genr(size)
  True.Status=matrix(Truth,n.row,n.col,byrow=TRUE)
  True.Concen=matrix(Truth.Con,n.row,n.col,byrow=TRUE)
  
  row.z=I(Mes.genr(apply(True.Concen,1,mean))>thres[1])*1
  col.z=I(Mes.genr(apply(True.Concen,2,mean))>thres[2])*1
  
  Diag.pos=0
  if(row.z[1]==1 && sum(col.z)==0)
  {
    Diag.pos=I(Mes.genr(True.Concen[1,1])>thres[3])*1
  }
  if(sum(row.z)==0 && col.z[1]==1)
  {
    Diag.pos=I(Mes.genr(True.Concen[1,1])>thres[3])*1
  }
  if(row.z[1]==1 && col.z[1]==1)
  {
    Diag.pos=I(Mes.genr(True.Concen[1,1])>thres[3])*1
  }
  
  return(list(Diag.pos=Diag.pos,Diag.neg=Diag.neg))
}


#***********************************************************************#
# Main function
# This function approximate EFF, PSE, PSP, PPV, NPV of H(n1:n2:...:nS) 
# for a given prevalence using M Monte Carlo replications
#
# Arguments:
# A.size:       = c(k,k). The configuration of the squared array A(k \times k)
# p             = disease prevalence
# thres         = c(threshold for size k, threshold for size k, threshold for size 1)
# Pos.genr: see Part I
# Neg.genr: see Part I
# Mes.genr: see Part I
# B: size of Monte Carlo simulation
Sim.Array=function(A.size,p,thres,Pos.genr,Neg.genr,Mes.genr,B)
{
  num=0
  num2=0
  dpos=0
  dneg=0
  for(j in 1:B)
  {
    #print(j)
    temp=Sim.Array.num(A.size,p,thres,Pos.genr,Neg.genr,Mes.genr)
    num=num+temp
    num2=num2+temp^2
    temp=Sim.Array.sesp(A.size,p,thres,Pos.genr,Neg.genr,Mes.genr)
    dpos=dpos+temp$Diag.pos
    dneg=dneg+temp$Diag.neg
  }
  eff.mean=num/B
  eff.sd=sqrt(round((B*num2-num^2)/(B*(B-1)), 8))
  pse=dpos/B
  psp=1-dneg/B
  ppv=pse*p/(pse*p+(1-psp)*(1-p))
  npv=psp*(1-p)/(psp*(1-p)+(1-pse)*p)
  
  # output everything in a list
  # EFF: approximated efficiency
  # SD:  approximated standard deviation of number of tests per specimen
  # PSE: approximated pooling sensitivty
  # PSP: approximated pooling specificity
  # PPV: approximated pooling positive predictive value
  # NPV: approximated pooling negative predcitive value
  return(list(EFF=eff.mean,SD=eff.sd,PSE=pse,PSP=psp,PPV=ppv,NPV=npv))
}


#***********************************************************************#
# Supporting function
# Function to exactly compute the operating characteristics of
# an S-stage hierarchial algorithm H(n1:n2:...:nS)
# when disease prevalence is p,
# but under traditional assumptions that were assumed in Kim et al. (2007).
# This funcion is coded based on Kim et al. (2007)
#
# Arguments:
# A.size:       = c(k,k). The configuration of the squared array A(k \times k)
# p             = disease prevalence
# Se            = sensitivity of individual testing
# Sp            = specificity of individual testing

Exact.Tradition.SArray.homo=function(A.size,p,Se,Sp)
{
  
  q=1-p
  
  f.gt=function(n)
  {
    res=(1-Sp)*q^n+Se*(1-q^n)
    return(res)
  }
  g.gt=function(n)
  {
    res=Se*f.gt(n)+(1-Se-Sp)*q^n*f.gt(n-1)
    return(res)
  }
  
  size=prod(A.size)
  a.n=A.size[1]
  
  beta0=function(c)
  {
    res=choose(a.n,c)*q^(a.n^2-c*a.n+c)*(1-q^(a.n-1))^c
    return(res)
  }
  beta1=function(c)
  {
    res=choose(a.n,c)*q^(a.n^2-c*a.n)*(1-q^a.n)^c-beta0(c)
    return(res)
  }
  gamma0=function(c)
  {
    res=(1-Sp)*(1-Se)^c*Sp^(a.n-c)
    return(res)
  }
  gamma1=function(c)
  {
    return(Se*Sp^(a.n-c)*(1-Se)^c)
  }
  
  s=0:a.n
  
  eff=2/a.n+g.gt(a.n)+2*sum( gamma0(s)*beta0(s)+gamma1(s)*beta1(s) )
  pse=Se^3+2*Se^2*(1-Se)*(1-f.gt(a.n))^(a.n-1)
  
  beta0y=function(c)
  {
    return(beta0(c)/q)
  }
  beta1y=function(c)
  {
    res=choose(a.n-1,c)*(1-q^a.n)^c*q^(a.n^2-a.n*c-1)+
      choose(a.n-1,c-1)*(1-q^a.n)^(c-1)*q^(a.n^2-a.n*c)*(1-q^(a.n-1))-
      beta0y(c)
  }
  
  psp=1-(1-Sp)*(f.gt(a.n-1))^2-2*(1-Sp)*sum( beta0y(s)*gamma0(s)+beta1y(s)*gamma1(s) )
  
  ppv=pse*p/(pse*p+(1-psp)*(1-p))
  npv=psp*(1-p)/(psp*(1-p)+(1-pse)*p)
  
  # return everything in a list
  # EFF: testing efficiency
  # PSE: pooling sensitivity
  # PSP: pooling specificity
  # PPV: pooling positive predictive value
  # NPV: pooling negative predictive value  
  return(list(EFF=eff,PSE=pse,PSP=psp,PPV=ppv,NPV=npv))
}




#***********************************************************************#
# Final function
# This function approximate EFF, PSE, PSP, PPV, NPV of A(k \times k) using four methods
# i.e., it provides FOUR sets operating charatersitics of A(k \times k)
# 1st set: consider biomarker pooling and fix threshold to be the threshold for individual testing
# 2nd set: consider biomarker pooling and use threshold to be the threshold for individual testing divided by pool size
# 3rd set: consider biomarker pooling and use our newly proposed threshold to maximize the pooled version of Youden index
# 4th set: under traditional assumption using methods provided by Kim et al. (2007)
# for a given prevalence using B Monte Carlo replications
#
# Arguments:
# A.size:       = c(k,k). The configuration of the squared array A(k \times k)
# p             = disease prevalence
# Pos.genr: see Part I
# Neg.genr: see Part I
# Mes.genr: see Part I
# B: size of Monte Carlo simulation

Sim.Array.all=function(S.size,p,Pos.genr,Neg.genr,Mes.genr,B)
{
	Ind.res=IndTest(Pos.genr,Neg.genr,Mes.gner,B)
	tau.star=Ind.res$tau.star
	print("The threshold for individual testing is")
	print(tau.star)
	Se=Ind.res$Se
	Sp=Ind.res$Sp
	print("The sensitivity for individual testing is")
	print(Se)
	print("The specificity for individual testing is")
	print(Sp)
	OP.fixt0=Sim.Array(A.size,p,c(tau.star,tau.star,tau.star),Pos.genr,Neg.genr,Mes.genr,B)
	OP.t0byn=Sim.Array(A.size,p,c(tau.star/A.size[1],tau.star/A.size[2],tau.star),Pos.genr,Neg.genr,Mes.genr,B)
	OP.newth=Sim.Array(A.size,p,c(Thres(A.size[1],Pos.genr,Neg.genr,Mes.genr,B),Thres(A.size[2],Pos.genr,Neg.genr,Mes.genr,B),tau.star),Pos.genr,Neg.genr,Mes.genr,B)
	OP.tradi=Exact.Tradition.SArray.homo(A.size, p, Se, Sp)
	res.biomarker=cbind(OP.fixt0,OP.t0byn,OP.newth) 
	res.traditional=cbind(OP.tradi)
	print("OP.fixt0: consider biomarker pooling and fix threshold to be the threshold for individual testing")
	print("OP.t0byn: consider biomarker pooling and use threshold to be the threshold for individual testing divided by pool size")
	print("OP.newth: consider biomarker pooling and use our newly proposed threshold to maximize the pooled version of Youden index")
	print("OP.tradi: under traditional assumption using methods provided by Kim et al. (2007)")
	return(list(OP.biomk=res.biomarker,OP.tradi=res.traditional))
}


######################################################################
# An illustrative Example: 
# Using f-, f+, f_epsilon, provided by May et al. (2010)
# We defined these distributions in Part I, please check above
# Target algorithm: A(10 \times 10)
# Prevlance p=0.05
# Monte Carlo size is 10,000.

# Set preliminaries
# Monte Carlo size
B=10000
# We set the configure of the hierarchical algorithm by S.size
A.size=c(10,10) # i.e., Array(10 \times 10)
# The disease prevalence is
p=0.05

set.seed(100)
Sim.Array.all(A.size,p,Pos.genr,Neg.genr,Mes.genr,B)

# output:
#[1] "The threshold for individual testing is"
#[1] 434.2249
#[1] "The sensitivity for individual testing is"
#[1] 0.9896
#[1] "The specificity for individual testing is"
#[1] 0.9808
#[1] "OP.fixt0: consider biomarker pooling and fix threshold to be the threshold for individual testing"
#[1] "OP.t0byn: consider biomarker pooling and use threshold to be the threshold for individual testing divided by pool size"
#[1] "OP.newth: consider biomarker pooling and use our newly proposed threshold to maximize the pooled version of Youden index"
#[1] "OP.tradi: under traditional assumption using methods provided by Kim et al. (2007)"
#$OP.biomk
#    OP.fixt0   OP.t0byn  OP.newth 
#EFF 0.253256   0.755195  0.393226 
#SD  0.05336722 0.171258  0.1218239
#PSE 0.4012     0.9897    0.8918   
#PSP 0.9993     0.9804    0.9954   
#PPV 0.9679131  0.7265986 0.9107435
#NPV 0.9694264  0.9994474 0.9943115

#$OP.tradi
#    OP.tradi 
#EFF 0.3849995
#PSE 0.9693037
#PSP 0.9972509
#PPV 0.9488677
#NPV 0.9983826