##################################### # BUM S-plus function library # Author: Stan Pounds # Created: September 13, 2002 # Converted to R by Guanchung Song Nov 6, 2003 # About this library: This library implements the BUM procedures for estimating # the occurrence of errors in microarray analysis. For more # information see Pounds and Morris (2003) "Estimating the # occurrence of false positives and false negatives in # microarray studies by approximating and partitioning the # empirical distribution of p-values." Bioinformatics, 19, 368-375. # Disclaimer: Any damages arising from use of this software are NOT # the responsibility of its developer, Stan Pounds, or # his employer, St. Jude Children's Research Hospital. ############################################################################## ############################################################################## # Functions in this library # dbum - calculate the density of the BUM distribution # pbum - calculate the cdf of the BUM distribution # qbum - calculate the quantile of the BUM distribution # rbum - generate random BUM observations # dalt - calculate the density of the alternative component of a BUM # palt - calculate the cdf of the alternative component of the BUM # qalt - calculate the quantile of the alternative component of the BUM # bum.logL - calculate the log-likelihood for the BUM model for a set # of p-values # logit - compute the logit # inv.logit - compute the inverse of the logit # neg.bum.logL - negative log-likelihood of the BUM model for a set of p-values # bum.mle - find the MLE for the BUM parameters # qqbum - produce a BUM quantile-quantile plot # bum.histogram - produce a histogram of p-values and compare to BUM MLE curve # ext.pi - extract the uniform component from a BUM distribution # bum.emp.post - compute the empirical Bayes' posterior for a given BUM model # bum.FDR - compute the false discovery rate for a given BUM model # bum.false.positive - estimate the proportion of false positives committed # bum.true.positive - estimate the proportion of true positives committed # bum.false.negative - estimate the proportion of false negatives committed # bum.true.negative - estimate the proportion of true negatives # bum.weighted.error - estimate a weighted sum of the proportion of # false negatives and false positives # find.FDR.threshold - find the p-value threshold corresponding to a desired false # discovery rate # find.EB.threshold - find the p-value threshold corresponding to a desired # empirical Bayes' posterior # inv.dbum - the mathematical inverse of the BUM density # find.WE.threshold - find the p-value threshold that minimizes the weighted # sum of the estimated proportion of false positives # and estimated proportion of false negatives # bum.error.diagram - produce a diagram illustrating the occurrence of # true positives, true negatives, false positives, and # false negatives in the analysis # cont.hyp.test - test for the presence of an alternative distribution # among a set of continuous p-values # perm.LRT - test for the presence of an alternative distribution among # a set of p-values generated by permutation # pvalue.table - output a table of p-values and error control quantities # for each probe to a specified file ############################################################################## ########################################################################## # Function: dbum # Purpose: Compute the pdf of the bum distribution # Arguments: x - point or vector of points at which to compute pdf # a - shape parameter of beta component # lambda - mixture parameter, proportion of uniform component # Returns: value of the pdf of the bum distribution for x ########################################################################## dbum<-function(x,a,lambda) { return(lambda+(1-lambda)*a*x^(a-1)) } ########################################################################## # Function: pbum # Purpose: Compute the cdf of the bum distribution # Arguments: x - point or vector of points at which to compute the pdf # a - shape parameter of the beta component # lambda - mixing parameter, weight of uniform component # Returns: value of the cdf of the bum distribution for x ########################################################################## pbum<-function(x,a,lambda) { return(lambda*x+(1-lambda)*x^a) } ########################################################################## # Function: qbum # Purpose: Compute the quantile of the bum distribution # Arguments: p - percentile or vector of percentiles # a - shape parameter of the beta component # lambda - mixing parameter, weight of uniform component # nbisect - the number of bisections to perform # Returns: the values x such that the pdf of x equals the percentiles # Notes: Uses the bisection method to find quantiles, a larger value of # nbisect will result in more accurate results. The results will # be accurate to within 2^(-nbisect). ########################################################################## qbum<-function(p,a,lambda,nbisect=20) { n<-length(p) mid<-rep(0,n) top<-rep(1,n) bot<-rep(0,n) gohigher<-rep(F,n) for (j in 1:nbisect) { mid<-(top+bot)/2 gohigher<-(pbum(mid,a,lambda)0) { starta<-logit(runif(nstartpts)) startlambda<-logit(runif(nstartpts)) } adjusted<-min(pvals)<=0 if (adjusted) pvals<-(nadj*pvals+adjeps)/(nadj+2*adjeps) bestlogL<- -Inf nstartpts<-length(starta) for (i in 1:nstartpts) { results <- nlm( neg.bum.logL, c(starta[i],startlambda[i]),pvals=pvals ) # OK in R # name of value in R results$minimum as results$objective in S+ if ( -results$minimum > bestlogL) { a<-inv.logit(results$estimate[1]) # parameters[1]) # S+ lambda<-inv.logit(results$estimate[2]) # parameters[2]) # S+ logL<- -results$minimum nits<- results$iterations termination <- results$code # message # S+ bestlogL<-logL } } return(list(a=a,lambda=lambda,logL=logL,pvals.adjusted=adjusted,nstartpts=nstartpts,nits=nits,termination=termination)) } ######################################################################### # Function: qqbum # Purpose: Produce a quantile-quantile plot for a set of p-values # Arguments: pvals - a vector of p-values # a - shape parameter for beta component of bum distribution # lambda - mixing parameter for BUM distribution # main - primary plot tile, default = "BUM QQ Plot" # xlab - label of x-axis, default = "BUM Expected p-value" # ylab - label of y-axis, default = "Observed p-value" # Returns: a quantile-quantile plot # Notes: If values of a and lambda are not provided, bum.mle # will be used to estimate a and lambda. May take a few minutes. ######################################################################### qqbum<-function(pvals,a=NULL,lambda=NULL,main="BUM QQ Plot",xlab="BUM Expected p-value",ylab="Observed p-value",nstartpts=0,starta=0,startlambda=0) { n<-length(pvals) pvals<-sort(pvals) if(is.null(a)||is.null(lambda)) { al<-bum.mle(pvals,nstartpts=nstartpts,starta=starta,startlambda=startlambda) plot(c(0,1),c(0,1),main=main,xlab=xlab,ylab=ylab,type="n") lines(qbum((rank(pvals)-.5)/n,al$a,al$lambda),pvals,lty=2) lines(c(0,1),c(0,1)) } else { plot(c(0,1),c(0,1),main=main,xlab=xlab,ylab=ylab,type="n") lines(qbum((rank(pvals)-.5)/n,a,lambda),pvals,lty=2) lines(c(0,1),c(0,1)) } } ######################################################################### # Function: bum.histogram # Purpose: Compare the fitted bum curve to the histogram # Arguments: pvalues - vector of p-values # a - a for bum curve, default = estimated MLE # lambda - lambda for bum curve, default = estimated MLE # main - primary title of plot, default = "Histogram" # xlab - label of x-axis, default = "p-value # ylab - label of y-axis, default = "Density" ######################################################################### bum.histogram<-function(pvalues,a=NA,lambda=NA,main="Histogram",xlab="p-value",ylab="Density",nstartpts=0,starta=0,startlambda=0) { if(is.na(a)||is.na(lambda)) { MLE<-bum.mle(pvalues,nstartpts=nstartpts,starta=starta,startlambda=startlambda) a<-MLE$a lambda<-MLE$lambda } hist(pvalues,probability=T,main=main,xlab=xlab,ylab=ylab) x<-1:100/100 lines(x,dbum(x,a,lambda),lwd=3) } ######################################################################### # Function: ext.pi # Purpose: Extract the maximal uniform componet from a bum density # Arguments: a - shape parameter of beta component of bum distribution # lambda - mixing parameter, component of bum that is uniform # Returns: the proportion of the density that can be extracted as a uniform ######################################################################### ext.pi<-function(a,lambda) { lambda[(lambda>1)+(lambda<0)>0]<-NA a[(a>1)+(a<0)]<-NA return((lambda+(1-lambda)*a)) } ######################################################################### # Function: bum.emp.post # Purpose: Compute the upper bound of BUM based empirical Bayes posterior # probability of the alternative hypothesis # Arguments: x - the point or vector of points at which to compute EB post # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # Returns: the upper bound of the BUM based empirical Bayes posterior at x # Notes: Computes an upper bound because maximal extraction of uniform # is performed. ######################################################################### bum.emp.post<-function(x,a,lambda) { pi<-ext.pi(a,lambda) return((dbum(x,a,lambda)-pi)/dbum(x,a,lambda)) } ######################################################################### # Function: bum.FDR # Purpose: Compute an estimated upper bound for the false discovery rate # when significance is determined by comparing a p-value to # a threshold tau # Arguments: tau - the threshold of comparison (point or vector) # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # Returns: Estimated upper bound of FDR # Notes: FDR is the expected proportion of rejections that are false # positives, or Type I errors ######################################################################### bum.FDR<-function(tau,a,lambda) { pii <- ext.pi(a,lambda) return(tau*pii/pbum(tau,a,lambda)) } ######################################################################### # Function: bum.false.positive # Purpose: Compute an estimated upper bound for the proportion of all tests # that will be false positives when significance is determined # by comparing the p-value to a threshold tau # Arguments: tau - the threshold of comparison (point or vector) # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # Returns: Estimated upper bound of proportion of all tests that are # false positives ######################################################################### bum.false.positive<-function(tau,a,lambda) { pi<-ext.pi(a,lambda) return(tau*pi) } ######################################################################### # Function: bum.false.negative # Purpose: Compute an estimated lower bound for the proportion of all # tests resulting in false negatives when significance is # determined by comparing the p-value to a threshold tau # Arguments: tau - threshold # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # Returns: Estimated lower bound for proportion of all tests resulting # false negatives (Type II errors) ######################################################################### bum.false.negative<-function(tau,a,lambda) { pi<-ext.pi(a,lambda) return(1-pbum(tau,a,lambda)-pi*(1-tau)) } ######################################################################### # Function: bum.true.positive # Purpose: Compute an estimated lower bound for the proportion of all # tests resulting in true positives when significance is # determined by comparing the p-value to a threshold tau # Arguments: tau - threshold # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # Returns: Estimated lower bound for proportion of all tests resulting # true positives ######################################################################### bum.true.positive<-function(tau,a,lambda) { pi<-ext.pi(a,lambda) return(pbum(tau,a,lambda)-pi*tau) } ######################################################################### # Function: bum.true.negative # Purpose: Compute an estimated lower bound for the proportion of all # tests resulting in true negatives when significance is # determined by comparing the p-value to a threshold tau # Arguments: tau - threshold # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # Returns: Estimated lower bound for proportion of all tests resulting # true negatives ######################################################################### bum.true.negative<-function(tau,a,lambda) { pi<-ext.pi(a,lambda) return(pi*(1-tau)) } ######################################################################### # Function: bum.weighted.error # Purpose: Compute a linear combination of the upper bound for the # proportion of false positives and the lower bound for the # proportion of false negatives resulting when significance # is determined by comparing p-values to a threshold # Arguments: tau - threshold # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # wfp - weight of the false positives # wfn - weight of the false negatives # Returns: the weighted combination of error rates ######################################################################### bum.weighted.error<-function(tau,a,lambda,wfp=1,wfn=1) { return(wfp*bum.false.positive(tau,a,lambda)+wfn*bum.false.negative(tau,a,lambda)) } ######################################################################### # Function: find.FDR.threshold # Purpose: Find a p-value threshold so that all p-values less than the # threshold have an FDR lower than a desired level # Arguments: fdr - desired fdr # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # Returns: p-value threshold that maintains the desired fdr ######################################################################### find.FDR.threshold<-function(fdr,a,lambda) { pi<-ext.pi(a,lambda) return(((pi-fdr*lambda)/(fdr*(1-lambda)))^(1/(a-1))) } ######################################################################### # Function: find.EB.threshold # Purpose: Find a p-value threshold so that all p-values less than the # threshold have an empirical Bayes (EB) posterior probability # greater than a desired level # Arguments: EB - desired empirical Bayes' posterior # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # Returns: p-value threshold that maintains the desired EB posterior ######################################################################### find.EB.threshold<-function(EB,a,lambda) { pi<-ext.pi(a,lambda) return(((EB*lambda+a*(1-lambda))/(a*(1-EB)*(1-lambda)))^(1/(a-1))) } ########################################################################## # Function: inv.dbum # Purpose: Compute the inverse of the pdf of the bum distribution # Arguments: y - value of the pdf of the bum of interest # a - shape parameter of the beta component # lambda - mixing parameter, weight of uniform component # nbisect - the number of bisections to perform # Returns: x so that pbum(x,a,lambda) = y # Notes: Uses the bisection method. The results will # be accurate to within 2^(-nbisect). Used by find.WE.threshold. ########################################################################## inv.dbum<-function(y,a,lambda,nbisect=20) { n<-length(y) mid<-rep(0,n) for (i in 1:n) { top<-1 bot<-0 for (j in 1:nbisect) { mid[i]<-mean(c(top,bot)) if(dbum(mid[i],a,lambda)>y[i]) bot<-mid[i] else top<-mid[i] } } return(mid) } ######################################################################### # Function: find.WE.threshold # Purpose: Compute the significance threshold for p-values so that # to obtain an optimal weighted error comparison # Arguments: a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # wfp - weight of false positives # wfn - weight of false negatives # nbisect - number of bisections for inv.dbum # Returns: a p-value threshold that optimizes the weighted error function ######################################################################### find.WE.threshold<-function(a,lambda,wfp=1,wfn=1,nbisect=20) { pi<-ext.pi(a,lambda) return(inv.dbum((wfp+wfn)*pi/wfn,a,lambda,nbisect)) } ######################################################################### # Function: bum.error.diagram # Purpose: Produce a bum.error.diagram # Arguments: tau - significance threshold for p-values # a - shape parameter of beta component of bum distribution # lambda - mixing parameter, proportion uniform in bum dist # Returns: an error region diagram ######################################################################### bum.error.diagram<-function(tau,a,lambda,xmin=NULL,title=NULL,showkey=F) { pi<-ext.pi(a,lambda) if (is.null(xmin)) xmin<-.1*tau if (is.null(title)) title<-"Error Regions" x<-seq(from=xmin,to=1,length=1000) ymax<-max(dbum(x,a,lambda)) plot(c(0,1),c(0,dbum(x[1],a,lambda)),main=title,xlab="p-value",ylab="Density",type="n") polygon(c(0,tau,tau,0),c(pi,pi,0,0),col=2) polygon(c(tau,1,1,tau),c(pi,pi,0,0),col=5) polygon(c(0,x[x<=tau],tau,tau,0),c(dbum(x[1],a,lambda),dbum(x[x<=tau],a,lambda),dbum(tau,a,lambda),pi,pi),col=3) polygon(c(tau,x[x>tau],1,tau),c(dbum(tau,a,lambda),dbum(x[x>tau],a,lambda),pi,pi),col=4) text(-.02,pi,"p",font=8) text(tau,-.01*ymax,"t",font=8) lines(x,dbum(x,a,lambda)) lines(c(0,1),c(pi,pi)) lines(c(tau,tau),c(0,dbum(tau,a,lambda))) lines(c(0,1),c(0,0)) lines(c(0,0),c(0,dbum(x[1],a,lambda))) lines(c(1,1),c(0,pi)) if(showkey) { text(.8,ymax,"p",font=8) text(.87,ymax,paste(" = ",round(pi,4))) text(.8,.95*ymax,"t",font=8) text(.87,.95*ymax,paste(" = ",round(tau,4))) text(.85,.9*ymax,paste("FP <=",round(bum.false.positive(tau,a,lambda),4)),col=2) text(.85,.85*ymax,paste("FN >=",round(bum.false.negative(tau,a,lambda),4)),col=4) text(.85,.80*ymax,paste("TP >=",round(bum.true.positive(tau,a,lambda),4)),col=3) text(.85,.75*ymax,paste("TN <=",round(bum.true.negative(tau,a,lambda),4)),col=5) } } ####################################################################################### # Function: cont.hyp.test # Purpose: Test for significant deviation from null hypothesis for a set of # p-values with continuous distribution # Argument: pvals - a vector of continuous p-values # Returns : a test statistic and corresponding p-value ####################################################################################### cont.hyp.test<-function(pvals) { mnlog<-mean(-log(pvals)) return(list(neglogpvalbar=mnlog,pval=1-pgamma(sum(-log(pvals)),length(pvals),1))) } ####################################################################################### # Function: perm.LRT # Purpose: Perform the likelihood ratio test (LRT) for a set of p-values obtained # by permutation # Arguments: pvals - a vector of p-values # nperms - the number of permutations used to compute the p-values # Returns: LR chi-square and corresponding p-value ######################################################################################## perm.LRT<-function(pvals,nperms) { counts<-rep(0.0,(nperms+1)) ngenes<-length(pvals) for (i in 0:nperms) counts[i+1]<-sum(nperms*pvals==i) LR<-sum(counts*(log(counts)-log(ngenes)))+ngenes*log(nperms+1) return(list(LR=LR,df=nperms,pval=1-pchisq(2*LR,nperms))) } ######################################################################################### # Function: pvalue.table # Purpose: produce a table of p-values and estimated error quantities for a set of probe # names or gene names # Arguments: filename - the output filename for the table # test.stat - the vector of test statistic values # pvalue - the vector of p-values (length g) # gene.names - a gx1 matrix of gene (or probe) names, where g = no. genes or probes # comments - a gx1 matrix of comments associated with the probes # amle - the MLE of a, if not provided, it will be estimated # lambdamle - the MLE of lambda, if not provided it will be estimated # adjn - the number used in adjusting p-values when necessary, if p-values # are generated by permutation, set adjn= no. of permutations performed # adjeps - the adjustment epsilon, p-values are adjusted by # adj.pvalue <- (adjn*pvalue+adjeps)/(adjn+2*adjeps) # Returns: the MLE for a, lambda, and proportion following the null hypothesis # Effects: Writes a table to the file filename # Notes: Adjustment of p-values is necessary when some p-values are zero or evaluate # to zero. The bum.mle function will crash when some p-values are zero. ########################################################################################### pvalue.table<-function(filename,test.stat,pvalue,gene.names,comments,amle=NA,lambdamle=NA,adjn=100000,adjeps=0.5,nstartpts=0,starta=0,startlambda=0) { minpvalue<-min(pvalue) n<-length(pvalue) if(is.vector(gene.names)) gene.names<-matrix(gene.names,n,1) if(is.vector(comments)) comments<-matrix(comments,n,1) if(is.na(amle)||is.na(lambdamle)) { if(minpvalue>0) { MLE<-bum.mle(pvalue,nstartpts=nstartpts,starta=starta,startlambda=startlambda) amle<-MLE$a lambdamle<-MLE$lambda } else { adj.pvalue<-(adjn*pvalue+adjeps)/(adjn+2*adjeps) MLE<-bum.mle(adj.pvalue,starta=starta,startlambda=startlambda) amle<-MLE$a lambdamle<-MLE$lambda } } ordering<-order(pvalue) if(minpvalue>0) { X<-cbind(gene.name=gene.names[,1], test.stat=test.stat, pvalue=pvalue, FDR=bum.FDR(pvalue,amle,lambdamle), EBP=bum.emp.post(pvalue,amle,lambdamle), comments=comments[,1]) } else { X<-cbind(gene.name=gene.names[,1], test.stat=test.stat, pvalue=pvalue, adj.pvalue=adj.pvalue, FDR=bum.FDR(adj.pvalue,amle,lambdamle), EBP=bum.emp.post(adj.pvalue,amle,lambdamle), comments=comments[,1]) } write.table(X[ordering,],filename,sep="\t") return(list(a=amle,lambda=lambdamle,pi=ext.pi(amle,lambdamle))) } bum.qrmse<-function(pval,a,lambda,nbisect=20) { sortp<-sort(pval) n<-length(pval) return(sqrt(sum((sortp-qbum((1:n-.5)/n,a,lambda,nbisect))^2)/(n-1))) }