######################################################################################################## # Filtering Library # This code implements the pooled p-value and error-minimizing pooled p-value filters as described by # Pounds and Cheng (2005) Statistical Development and Evaluation of Gene Expression Data Filters. Journal of Computational Biology. # # Three functions are defined in this code: # probe.filter - apply the pooled p-value or error-minimizing pooled p-value filter to a set # of Affymetrix present-absent p-values # pooled.p - compute pooled p-values given the present-absent p-values # splosh0 - called by probe.filter, applies the SPLOSH algorithm as part of the EMPP filter ####################################################################################################### ################################################################################################################# # Function: probe.filter # Purpose: Apply the pooled p-value or EMPP filter as described by Pounds and Cheng (2005 J. Comp. Biology) # # Arguments: P - an (n x g) matrix giving the Affy present-absent p-values with # rows representing n chips and columns representing g probe sets # grp - a vector of length n giving the experimental group memberships of the n chips # alpha - the cut-off for pooled p-values, default = "MTE" for the error-minimizing pooled p-value filter # opts - list of options for the SPLOSH algorithm, defaults provided, see comments on splosh0 function below # # Returns: a list with the following components # include - a boolean that indicates whether the corresponding probe should be included in further analysis # uniq.grps - names of the experimental groups # alpha - vector or scalar giving the inclusion-threshold for the pooled p-value for each of the experimental groups # pooled.p - matrix of pooled p-values for each probe (columns) and each experimental group (rows) # ###################################################################################################################### probe.filter<-function(P,grp,alpha="MTE",delta=0.01,opts=list(LC=loess.control(),dplaces=4)) { n <- dim(P)[1] g <- dim(P)[2] grp<-as.character(grp) ug<-unique(grp) ng<-length(ug) L<-matrix(0,ng,n) for (i in 1:ng) L[i,]<-(grp==ug[i]) m<-L%*%rep(1,n) p<-matrix(1-pgamma(-L%*%log(P),m,1),ng,g) if (alpha=="MTE") { alpha<-rep(NA,ng) for (i in 1:ng) { sub<-((p[i,]>delta)+(p[i,]<(1-delta)))==2 res<-splosh0(p[i,sub],opts$LC,opts$dplaces) alpha[i]<-max(p[i,sub][res$toterr==min(res$toterr)]) } } inc0<-(p<=alpha) include<-(t(rep(1,ng))%*%(p<=alpha))>0 return(list(include=as.vector(include),uniq.grps=ug,alpha=alpha,pooled.p=p)) } ########################################################################## # Function: pooled.p # Purpose: Compute pooled p-values # Arguments: P - a matrix giving the Affy present-absent p-values with # rows representing n chips and columns representing g probe sets # Returns: p - a vector of pooled p-values for each of the g probe sets ########################################################################## pooled.p<-function(P) { n <- dim(P)[1] g <- dim(P)[2] ones <- rep(1,n) y <- as.vector(-ones%*%log(P)) p <- 1-pgamma(y,n,1) return(p) } ######################################################################################################## # Function name: splosh0 # Arguments: pvals - a vector of p-values obtained by applying a statistical hypothesis # test to the expression values for each gene individually # LC - (optional) a list containing tuning parameters for the loess algorithm, # see help(loess.control) for more details, defaults to S-plus defaults # dplaces - (optional) round p-values to this many decimal places to avoid # numerical difficulties that can arise by division by small numbers, default = 6 # notes - (optional) may pass any string describing notes regarding the analysis, default = NULL # # Ouputs: a list with the following elements # cfdr - a vector of the cfdr estimates at the values specified by the user in pvals # q - a vector of corresponding monotone fdr estimates, similar to Storey's q-value # cdf - vector of the SPLOSH cdf estimate evaluated at pvals # fp - vector of estimated no. of false positives incurred by setting threshold # at corresponding value of pvals # fn - vector of estimated no. of false negatives incurred by setting threshold # at corresponding value of pvals # toterr - vector giving the sum of fp and fn for each entry # ebp - the empirical Bayes posterior that the null hypothesis is true, see Efron et al. (JASA 2001) # pi - estimate of the mixing parameter giving the probability of the null hypothesis # ord - a vector giving index numbers to order results according to ascending p-values # dplaces - echoes value of dplaces used in algorithm # LC - echoes value of LC used in algorithm # fdr.method - gives bibliographic reference for the method # notes - returns notes regarding the analysis ########################################################################################################### splosh0<-function(pvals,LC=loess.control(),dplaces=4,notes=NULL) { notes<-c(notes,"SPLOSH used to estimate FDR.") x<-round(pvals,dplaces) # Prevent numerical difficulties n<-length(pvals) # Obtain no. of points r<-rank(x) # rank observations ux<-sort(unique(x)) # ordered unique p-values ur<-(sort(unique(r)))-.5 # ordered unique ranks if (max(ux)<1) # edge adjustments { ux<-c(ux,1) ur<-c(ur,n) } if (min(ux)>0) { ux<-c(0,ux) ur<-c(0,ur) } nux<-length(ux) # No. of unique points ur<-ur/n # Divide ranks by n dx<-diff(ux) # spacings of unique p-values dr<-diff(ur) # spacings of unique ranks dFdx<-exp(log(dr)-log(dx)) # spacing-interval slopes mr<-(ur[1:(nux-1)]+ur[2:nux])/(2) # Unitized Medians of Unique Rank Intervals mx<-(ux[1:(nux-1)]+ux[2:nux])/2 # Unitized Medians of Unique p-value Intervals tr<-asin(2*(mr-.5)) # Arc-Sine Transform Rank Midpoints fit<-loess(log(dFdx)~tr,loess.control=LC) # Lowess of log-transformed EDQF and transformed rank-midpoints tr2<-asin(2*(ur-.5)) yhat<-(predict.loess(fit,tr2)) # Obtain Estimated Derivative of Quantile Function yhat[c(1,nux)]<-approx(tr2[2:(nux-1)], yhat[2:(nux-1)], xout=tr2[c(1,nux)], rule=3)$y # Extrapolate to get endpoint estimates pdf1<-exp(yhat) # Obtain PDF up to a constant trap<-0.5*(pdf1[1:(nux-1)]+pdf1[2:nux])*dx # Trapezoid rule terms const<-sum(trap) # Constant via trapezoid rule pdf2<-pdf1/const # Adjust pdf by constant pdf<-approx(ux,pdf2,xout=pvals,rule=3)$y # Obtain pdf estimate for p-values cdf<-approx(ux[2:nux], cumsum(trap), xout=pvals, rule=2)$y/const # Obtain CDF pi<-min(pdf) # Take pi as min of PDF p.pi<-max(pvals[pdf==pi]) # p-value at which pi occurs cfdr<-pi*pvals/cdf # Estimate cFDR for p-values ebp<-pi/pdf # Estimate EBP for p-values cfdr[pvals==0]<-ebp[pvals==0] # L'Hospital's Rule cfdr[cfdr>1]<-1 # Set max(fdr)=1 qval<-cfdr # Initialize q-value vector ordering<-order(pvals) # Report Results qval[rev(ordering)]<-cummin(qval[rev(ordering)]) # Compute q-value fp<-n*cfdr*cdf fn<-n*(1-cdf-pi*(1-pvals)) fn[pvals>=p.pi]<-0 toterr<-fp+fn return(list(cfdr=matrix(cfdr,n,1), # cfdr estimate q=matrix(qval,n,1), # q-value estimate cdf=matrix(cdf,n,1), # cdf estimate fp=matrix(fp,n,1), # estimate of no. of false postives fn=matrix(fn,n,1), # estimate of no. of false negatives toterr=matrix(toterr,n,1), # estimated total no. of errors pdf=matrix(pdf,n,1), # estimate of pdf ebp=matrix(ebp,n,1), # estimate of ebp pi=pi, # estimate of null proportion p.pi=p.pi, # p-value at which estimated pdf achieves minimum ord=ordering, # ordering vector dplaces=dplaces, # rounded to this no. of decimal places LC=LC, # loess.control options used fdr.method="Pounds and Cheng (2004)", # Bibliographic Information notes=notes)) # Notes regarding analysis }