######################################################################### # R functions to compute the operating characteristics for # # squared array A(k \times k) via Monte Carlo Simulation, # # corresponding to the manuscript titled, # # "Group testing case identification with biomarker information" # # Date: 08/06/2017. # ######################################################################### rm(list=ls(all=TRUE)) ######################################################################### # Part I: Define biomarker distribution: f+, f-, f_epsilon, separately # # Improtant: # Pos.genr(B) should generate B samples from f+ # Neg.genr(B) should generate B samples from f- # Mes.genr(c.tilde) should generate a measured value for each value of the argument "c.tilde" based on f_epsilon ######################################################################### # The following illustrates how to construct these functions by using the biomarker distributions # and the measurement error distribution identified by May et al. (2010) ############################################################## # Neg.genr(B) generates B random samples from the user-chosen negative biomarker distribution. # The following example defines the negative biomarker distribution given in May et al. (2010) Neg.genr=function(B) { para=c( 0.85,0.05,0.1, 0,50, 50,100, 100,500) m.n=3 multi.prob=para[1:m.n] temp.a=rmultinom(B,1,multi.prob) temp.b=rbind( runif(B,para[m.n+1],para[m.n+2]), runif(B,para[m.n+3],para[m.n+4]), runif(B,para[m.n+5],para[m.n+6]) ) # This defines the distribution of negative individual biomarker # levels by a mixed uniform distribution, #with probability 0.85 being U(0,50), # 0.05 being U(50,100), 0.1 being U(100,500). # The output is a random sample of size n from this mixed uniform distribution return(colSums(temp.a*temp.b)) } ############################################################ # Pos.genr(B) generates B random samples from the user-chosen positive biomarker distribution # The following example defines the positive biomarker distribution given in May et al. (2010) Pos.genr=function(B) { para=c(0.93,0.07, 2.7,1.6,.5, #location,shape,scale 2.7,3.18,.5 #location,shape,scale ) m.n=2 multi.prob=para[1:m.n] temp.a=rmultinom(B,1,multi.prob) temp.b=rbind( rgamma(B,shape=para[m.n+2],scale=para[m.n+3])+para[m.n+1], rgamma(B,shape=para[m.n+5],scale=para[m.n+6])+para[m.n+4] ) # This defines the distribution of positive individual biomarker # levels by a mixed gamma distribution, #with probability 0.93 being Gamma(2.7,1.6,0.5), # 0.07 being Gamma(2.7,3.18,0.5). # The output is a random sample of size n from this mixed gamma distribution. return(10^(colSums(temp.a*temp.b))) } ############################################################ # Given a vector of true biomarker levels \tilde C's, this function produces measured values of these true levels # based on the user-chosen measurement error distribution. # The following example defines the measurement error distribution given in May et al. (2010) Mes.genr=function(c.tilde) { para=c(0,0.12) sample=log10(c.tilde) # this defines that for true level \tilde C, # the measurment C given \tilde C follows # N(log10(\tilde C), 0.12^2) return(10^(rnorm( length(sample), sample+para[1], para[2]) ) ) } ################################################################################# # Part II: obtain threshold, sensitivity, Specificity for individual testing # Arguments: # Pos.genr: see Part I # Neg.genr: see Part I # Mes.genr: see Part I # B: number of Monte Carlo samples # Outputs: # tau.star: the threshold for individual testing (from maximizing the Youden index) # Se: the sensitivity for individual testing # Sp: the specificity for individual testing IndTest=function(Pos.genr,Neg.genr,Mes.gner,B) { # Generate B samples from f+ sample.pos=Mes.genr(Pos.genr(B)) # Generate B samples from f- sample.neg=Mes.genr(Neg.genr(B)) # Maximize the Youden index to find the optimal threshold for individual testing int=c(median(sample.neg),median(sample.pos)) Youden=function(t) { res=mean(sample.pos>t)+mean(sample.negtau.star) # Specificity Sp=mean(sample.negt)+mean(sample.negthres[1])*1 col.z=I(Mes.genr(apply(True.Concen,2,mean))>thres[2])*1 eff=eff+sum(A.size) if(sum(row.z)==0 && sum(col.z)!=0) { eff=eff+sum(col.z)*n.row } if(sum(row.z)!=0 && sum(col.z)==0) { eff=eff+sum(row.z)*n.col } if(sum(row.z)!=0 && sum(col.z)!=0) { eff=eff+sum(row.z)*sum(col.z) } return(num=eff/size) } #***********************************************************************# # Supporting function # This function simply does one Monte Carlo simulation for pooling sensitivity and pooling specificity # Arguments: # A.size: = c(k,k). The configuration of the squared array A(k \times k) # p = disease prevalence # thres = c(threshold for size k, threshold for size k, threshold for size 1) # Pos.genr: see Part I # Neg.genr: see Part I # Mes.genr: see Part I Sim.Array.sesp=function(A.size,p,thres,Pos.genr,Neg.genr,Mes.genr) { size=prod(A.size) n.row=A.size[1] n.col=A.size[2] Truth=c(0,rbinom(size-1,1,p)) Truth.Con=Truth*Pos.genr(size) +(1-Truth)*Neg.genr(size) True.Status=matrix(Truth,n.row,n.col,byrow=TRUE) True.Concen=matrix(Truth.Con,n.row,n.col,byrow=TRUE) row.z=I(Mes.genr(apply(True.Concen,1,mean))>thres[1])*1 col.z=I(Mes.genr(apply(True.Concen,2,mean))>thres[2])*1 Diag.neg=0 if(row.z[1]==1 && sum(col.z)==0) { Diag.neg=I(Mes.genr(True.Concen[1,1])>thres[3])*1 } if(sum(row.z)==0 && col.z[1]==1) { Diag.neg=I(Mes.genr(True.Concen[1,1])>thres[3])*1 } if(row.z[1]==1 && col.z[1]==1) { Diag.neg=I(Mes.genr(True.Concen[1,1])>thres[3])*1 } Truth=c(1,rbinom(size-1,1,p)) Truth.Con=Truth*Pos.genr(size) +(1-Truth)*Neg.genr(size) True.Status=matrix(Truth,n.row,n.col,byrow=TRUE) True.Concen=matrix(Truth.Con,n.row,n.col,byrow=TRUE) row.z=I(Mes.genr(apply(True.Concen,1,mean))>thres[1])*1 col.z=I(Mes.genr(apply(True.Concen,2,mean))>thres[2])*1 Diag.pos=0 if(row.z[1]==1 && sum(col.z)==0) { Diag.pos=I(Mes.genr(True.Concen[1,1])>thres[3])*1 } if(sum(row.z)==0 && col.z[1]==1) { Diag.pos=I(Mes.genr(True.Concen[1,1])>thres[3])*1 } if(row.z[1]==1 && col.z[1]==1) { Diag.pos=I(Mes.genr(True.Concen[1,1])>thres[3])*1 } return(list(Diag.pos=Diag.pos,Diag.neg=Diag.neg)) } #***********************************************************************# # Main function # This function approximate EFF, PSE, PSP, PPV, NPV of H(n1:n2:...:nS) # for a given prevalence using M Monte Carlo replications # # Arguments: # A.size: = c(k,k). The configuration of the squared array A(k \times k) # p = disease prevalence # thres = c(threshold for size k, threshold for size k, threshold for size 1) # Pos.genr: see Part I # Neg.genr: see Part I # Mes.genr: see Part I # B: size of Monte Carlo simulation Sim.Array=function(A.size,p,thres,Pos.genr,Neg.genr,Mes.genr,B) { num=0 num2=0 dpos=0 dneg=0 for(j in 1:B) { #print(j) temp=Sim.Array.num(A.size,p,thres,Pos.genr,Neg.genr,Mes.genr) num=num+temp num2=num2+temp^2 temp=Sim.Array.sesp(A.size,p,thres,Pos.genr,Neg.genr,Mes.genr) dpos=dpos+temp$Diag.pos dneg=dneg+temp$Diag.neg } eff.mean=num/B eff.sd=sqrt(round((B*num2-num^2)/(B*(B-1)), 8)) pse=dpos/B psp=1-dneg/B ppv=pse*p/(pse*p+(1-psp)*(1-p)) npv=psp*(1-p)/(psp*(1-p)+(1-pse)*p) # output everything in a list # EFF: approximated efficiency # SD: approximated standard deviation of number of tests per specimen # PSE: approximated pooling sensitivty # PSP: approximated pooling specificity # PPV: approximated pooling positive predictive value # NPV: approximated pooling negative predcitive value return(list(EFF=eff.mean,SD=eff.sd,PSE=pse,PSP=psp,PPV=ppv,NPV=npv)) } #***********************************************************************# # Supporting function # Function to exactly compute the operating characteristics of # an S-stage hierarchial algorithm H(n1:n2:...:nS) # when disease prevalence is p, # but under traditional assumptions that were assumed in Kim et al. (2007). # This funcion is coded based on Kim et al. (2007) # # Arguments: # A.size: = c(k,k). The configuration of the squared array A(k \times k) # p = disease prevalence # Se = sensitivity of individual testing # Sp = specificity of individual testing Exact.Tradition.SArray.homo=function(A.size,p,Se,Sp) { q=1-p f.gt=function(n) { res=(1-Sp)*q^n+Se*(1-q^n) return(res) } g.gt=function(n) { res=Se*f.gt(n)+(1-Se-Sp)*q^n*f.gt(n-1) return(res) } size=prod(A.size) a.n=A.size[1] beta0=function(c) { res=choose(a.n,c)*q^(a.n^2-c*a.n+c)*(1-q^(a.n-1))^c return(res) } beta1=function(c) { res=choose(a.n,c)*q^(a.n^2-c*a.n)*(1-q^a.n)^c-beta0(c) return(res) } gamma0=function(c) { res=(1-Sp)*(1-Se)^c*Sp^(a.n-c) return(res) } gamma1=function(c) { return(Se*Sp^(a.n-c)*(1-Se)^c) } s=0:a.n eff=2/a.n+g.gt(a.n)+2*sum( gamma0(s)*beta0(s)+gamma1(s)*beta1(s) ) pse=Se^3+2*Se^2*(1-Se)*(1-f.gt(a.n))^(a.n-1) beta0y=function(c) { return(beta0(c)/q) } beta1y=function(c) { res=choose(a.n-1,c)*(1-q^a.n)^c*q^(a.n^2-a.n*c-1)+ choose(a.n-1,c-1)*(1-q^a.n)^(c-1)*q^(a.n^2-a.n*c)*(1-q^(a.n-1))- beta0y(c) } psp=1-(1-Sp)*(f.gt(a.n-1))^2-2*(1-Sp)*sum( beta0y(s)*gamma0(s)+beta1y(s)*gamma1(s) ) ppv=pse*p/(pse*p+(1-psp)*(1-p)) npv=psp*(1-p)/(psp*(1-p)+(1-pse)*p) # return everything in a list # EFF: testing efficiency # PSE: pooling sensitivity # PSP: pooling specificity # PPV: pooling positive predictive value # NPV: pooling negative predictive value return(list(EFF=eff,PSE=pse,PSP=psp,PPV=ppv,NPV=npv)) } #***********************************************************************# # Final function # This function approximate EFF, PSE, PSP, PPV, NPV of A(k \times k) using four methods # i.e., it provides FOUR sets operating charatersitics of A(k \times k) # 1st set: consider biomarker pooling and fix threshold to be the threshold for individual testing # 2nd set: consider biomarker pooling and use threshold to be the threshold for individual testing divided by pool size # 3rd set: consider biomarker pooling and use our newly proposed threshold to maximize the pooled version of Youden index # 4th set: under traditional assumption using methods provided by Kim et al. (2007) # for a given prevalence using B Monte Carlo replications # # Arguments: # A.size: = c(k,k). The configuration of the squared array A(k \times k) # p = disease prevalence # Pos.genr: see Part I # Neg.genr: see Part I # Mes.genr: see Part I # B: size of Monte Carlo simulation Sim.Array.all=function(S.size,p,Pos.genr,Neg.genr,Mes.genr,B) { Ind.res=IndTest(Pos.genr,Neg.genr,Mes.gner,B) tau.star=Ind.res$tau.star print("The threshold for individual testing is") print(tau.star) Se=Ind.res$Se Sp=Ind.res$Sp print("The sensitivity for individual testing is") print(Se) print("The specificity for individual testing is") print(Sp) OP.fixt0=Sim.Array(A.size,p,c(tau.star,tau.star,tau.star),Pos.genr,Neg.genr,Mes.genr,B) OP.t0byn=Sim.Array(A.size,p,c(tau.star/A.size[1],tau.star/A.size[2],tau.star),Pos.genr,Neg.genr,Mes.genr,B) OP.newth=Sim.Array(A.size,p,c(Thres(A.size[1],Pos.genr,Neg.genr,Mes.genr,B),Thres(A.size[2],Pos.genr,Neg.genr,Mes.genr,B),tau.star),Pos.genr,Neg.genr,Mes.genr,B) OP.tradi=Exact.Tradition.SArray.homo(A.size, p, Se, Sp) res.biomarker=cbind(OP.fixt0,OP.t0byn,OP.newth) res.traditional=cbind(OP.tradi) print("OP.fixt0: consider biomarker pooling and fix threshold to be the threshold for individual testing") print("OP.t0byn: consider biomarker pooling and use threshold to be the threshold for individual testing divided by pool size") print("OP.newth: consider biomarker pooling and use our newly proposed threshold to maximize the pooled version of Youden index") print("OP.tradi: under traditional assumption using methods provided by Kim et al. (2007)") return(list(OP.biomk=res.biomarker,OP.tradi=res.traditional)) } ###################################################################### # An illustrative Example: # Using f-, f+, f_epsilon, provided by May et al. (2010) # We defined these distributions in Part I, please check above # Target algorithm: A(10 \times 10) # Prevlance p=0.05 # Monte Carlo size is 10,000. # Set preliminaries # Monte Carlo size B=10000 # We set the configure of the hierarchical algorithm by S.size A.size=c(10,10) # i.e., Array(10 \times 10) # The disease prevalence is p=0.05 set.seed(100) Sim.Array.all(A.size,p,Pos.genr,Neg.genr,Mes.genr,B) # output: #[1] "The threshold for individual testing is" #[1] 434.2249 #[1] "The sensitivity for individual testing is" #[1] 0.9896 #[1] "The specificity for individual testing is" #[1] 0.9808 #[1] "OP.fixt0: consider biomarker pooling and fix threshold to be the threshold for individual testing" #[1] "OP.t0byn: consider biomarker pooling and use threshold to be the threshold for individual testing divided by pool size" #[1] "OP.newth: consider biomarker pooling and use our newly proposed threshold to maximize the pooled version of Youden index" #[1] "OP.tradi: under traditional assumption using methods provided by Kim et al. (2007)" #$OP.biomk # OP.fixt0 OP.t0byn OP.newth #EFF 0.253256 0.755195 0.393226 #SD 0.05336722 0.171258 0.1218239 #PSE 0.4012 0.9897 0.8918 #PSP 0.9993 0.9804 0.9954 #PPV 0.9679131 0.7265986 0.9107435 #NPV 0.9694264 0.9994474 0.9943115 #$OP.tradi # OP.tradi #EFF 0.3849995 #PSE 0.9693037 #PSP 0.9972509 #PPV 0.9488677 #NPV 0.9983826