################################################
# This R code defines two functions.
# hybrid.poisson.test: perform the analysis method of Pounds, Gao, and Zhang (2012, Stat Apps in Genetics and Molecular Biology)
# pval.hist: fit the adaptive histogram estimator to the p-values as described by Pounds, Gao, and Zhang
# Notes: This library borrows code from Rebecca Doerge (www.stat.purdue.edu/~doerge/software/TSPM.R)
################################################

####################################
# Function: hybrid.poisson.test
# Purpose: Use the score test for overdispersion to choose a standard Poisson likelihood
#          ratio test or a quasi-likelihood test for each gene
# Arguments: see the comments in the function signature below
# Returns: a data.frame, see the return line below
####################################

hybrid.poisson.test <- function(counts,                   # matrix of mRNA-seq counts, rows for genes and columns for subjects
                                x1,                       # vector of group labels, must have the same number of entries as counts has columns
                                x0=rep(1,length(x1)),     # vector of factors for the "null" model, defaults to all 1's for a standard two-group comparison
                                lib.size=colSums(counts), # vector of normalizing constants, defaults to the total number of counts for each subject
                                use.offset=TRUE)          # logical: if TRUE, include log(lib.size) as an offset in each GLM
{
  #### Initialize per-gene result vectors ####
  n <- dim(counts)[1]       # number of genes
  n.samp <- dim(counts)[2]  # number of subjects
  per.gene.disp <- NULL     # Pearson overdispersion estimates
  LRT <- NULL               # likelihood ratio statistics
  score.test <- NULL        # overdispersion score statistics
  LFC <- NULL               # estimated log fold-changes

  ###### The main loop that fits the GLMs to each gene ######
  for(i in 1:n){
    ### Fit full and reduced models ###
    if (use.offset)
    {
      model.1 <- glm(as.numeric(counts[i,]) ~ x1, offset=log(lib.size), family=poisson)
      model.0 <- glm(as.numeric(counts[i,]) ~ x0, offset=log(lib.size), family=poisson)
    }
    else
    {
      model.1 <- glm(as.numeric(counts[i,]) ~ x1, family=poisson)
      model.0 <- glm(as.numeric(counts[i,]) ~ x0, family=poisson)
    }

    ### Obtain diagonals of the hat matrix from the full model fit ###
    hats <- hatvalues(model.1)

    ### Obtain the Pearson overdispersion estimate ###
    per.gene.disp[i] <- sum(residuals(model.1, type="pearson")^2)/model.1$df.residual

    ### Obtain the likelihood ratio statistic ###
    LRT[i] <- deviance(model.0)-deviance(model.1)

    ### Obtain the score test statistic for overdispersion (same form as in TSPM.R) ###
    score.test[i] <- 1/(2*length(counts[i,])) *
      sum(residuals(model.1, type="pearson")^2 -
          ((counts[i,] - hats*model.1$fitted.values)/model.1$fitted.values))^2

    ### Obtain the estimated log fold-change (negative of the fitted group coefficient) ###
    LFC[i] <- -model.1$coef[2]
  }

  ###### Compute p-values ######
  p.dispersion <- pchisq(score.test, df=1, lower.tail=FALSE)  # test for overdispersion
  p.f <- pf(LRT/per.gene.disp, df1=1, df2=model.1$df.residual, lower.tail=FALSE)  # test for differential expression assuming overdispersion
                                                              # (the residual df is the same for every gene, so the last fitted model supplies it)
  p.chi <- pchisq(LRT, df=1, lower.tail=FALSE)                # test for differential expression assuming the standard Poisson model

  ###### Compute EBPs ######
  ebp.st.poisson <- pval.hist(p.dispersion)$h.ebp  # EBP that the standard Poisson model is correct
  ebp.null.od <- pval.hist(p.f)$h.ebp              # EBP of no differential expression assuming the overdispersed Poisson model
  ebp.null.st <- pval.hist(p.chi)$h.ebp            # EBP of no differential expression assuming the standard Poisson model
  ebp.null.wgt <- ebp.null.st*ebp.st.poisson+ebp.null.od*(1-ebp.st.poisson)

  ###### Pick the better test for each gene ######
  p.best <- p.chi
  p.best[ebp.st.poisson<0.5] <- p.f[ebp.st.poisson<0.5]
  ebp.null.best <- pval.hist(p.best)$h.ebp

  ### Output ###
  return(cbind.data.frame(log.fold.change=LFC,            # log fold-change
                          lrt.stat=LRT,                   # likelihood ratio statistic from the standard Poisson model
                          od.hat=per.gene.disp,           # dispersion parameter estimate
                          F.stat=LRT/per.gene.disp,       # F-statistic from the quasi-likelihood test
                          p.dispersion=p.dispersion,      # p-value for the test of overdispersion
                          p.od.model=p.f,                 # p-value for differential expression assuming overdispersion
                          p.st.model=p.chi,               # p-value for differential expression assuming the standard Poisson model
                          p.best=p.best,                  # p-value from using the EBP that the standard Poisson model is correct to pick the best test for each gene
                          ebp.st.poisson=ebp.st.poisson,  # EBP that the standard Poisson model is correct
                          ebp.null.st=ebp.null.st,        # EBP of no differential expression assuming the standard Poisson model is correct
                          ebp.null.od=ebp.null.od,        # EBP of no differential expression assuming the overdispersed model
                          ebp.null.wgt=ebp.null.wgt,      # weighted average EBP of no differential expression from the standard and overdispersed Poisson models
                          ebp.null.best=ebp.null.best))   # EBP of no differential expression using p.best
}
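####################################
# Example: a minimal usage sketch, not run when this file is sourced.
# The simulated data are illustrative assumptions (100 genes, two groups of
# 4 subjects, negative binomial counts so that some genes are overdispersed
# relative to the Poisson model); they are not part of the published method.
####################################

if (FALSE)
{
  set.seed(123)
  n.genes <- 100
  grp <- rep(0:1, each=4)                                  # two-group labels for x1
  mu <- matrix(rep(rexp(n.genes, rate=0.01), 8),           # per-gene mean counts,
               n.genes, 8)                                 # constant across subjects
  sim.counts <- matrix(rnbinom(n.genes*8, mu=mu, size=5),  # overdispersed counts
                       n.genes, 8)
  res <- hybrid.poisson.test(sim.counts, x1=grp)           # offset and library sizes by default
  head(res[, c("log.fold.change","p.best","ebp.null.best")])
}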
####################################
# Function: pval.hist
# Purpose: Use the adaptive histogram estimator to compute the EBP, FDR, etc.
# Arguments: p - a vector of p-values
# Returns: a list as documented in the return line below
####################################

pval.hist<-function(p)
{
  m<-length(p)                            # count the p-values
  h.cdf<-h.pdf<-p.edf<-rep(NA,length(p))  # initialize the h.cdf, h.pdf, and p.edf vectors
  ord<-order(p)                           # order the p-values
  p.edf[ord]<-(1:m)/(m+1)                 # define the EDF estimator

  ###### Build the histogram break-points by scanning the EDF from right to left ######
  hh<-NULL                                # initialize the histogram heights
  F0<-p0<-b.edf<-p.brks<-1                # initialize F0, p0, the EDF break-points, and the p-value break-points
  while(any(p<p0))                        # add break-points until no p-value lies left of the anchor (p0,F0)
  {
    slp<-(F0-p.edf)/(p0-p)                # slope of the chord from each EDF point to the anchor
    slp.min<-min(slp[p<p0])               # the flattest chord stays above all EDF points to its left
    ok<-(p<p0)&(slp<=slp.min)             # find the p-values that are "OK" (those the flattest chord touches)
    if (any(ok))                          # Insert a new histogram break-point
    {
      p.cut<-min(p[ok])                   # Find the minimum p-value that is OK
      p.brks<-c(p.cut,p.brks)             # Update the p-value break-point vector
      hh<-c(slp.min,hh)                   # Update the histogram heights vector
      p0<-p.cut                           # Update p0
      F0<-min(p.edf[p>=p0])               # Update F0
      b.edf<-c(F0,b.edf)                  # Update the break-points in the EDF histogram estimator
    }
    else                                  # defensive: no point qualifies, so insert a (p,EDF) point at (0,0)
    {
      p.brks<-c(0,min(p),p.brks)          # close the histogram through the smallest p-value
      b.edf<-c(0,min(p.edf),b.edf)        # and anchor the EDF estimator at the origin
      p0<-0                               # setting p0 to 0 exits the loop
    }
  }
  if (p.brks[1]>0)                        # ensure the histogram extends to (0,0)
  {
    p.brks<-c(0,p.brks)
    b.edf<-c(0,b.edf)
  }

  ###### Remove zero-width bars and compute the histogram heights ######
  zero.diffs<-which(diff(p.brks)==0)      # find duplicated break-points
  if (length(zero.diffs)>0)
  {
    p.brks<-p.brks[-(zero.diffs+1)]
    b.edf<-b.edf[-(zero.diffs+1)]
  }
  hh<-exp(log(diff(b.edf))-log(diff(p.brks)))  # bar height = rise in the EDF over the width of the bar
  if (any(is.na(hh)))
  {
    print(summary(p))
    print(p.brks)
    print(b.edf)
    print(hh)
    stop("undefined histogram heights.")
  }

  ###### Enforce monotonicity of the histogram heights ######
  hd<-diff(hh)                            # Difference in histogram heights
  while(any(hd>0))                        # Enforce monotonicity
  {
    k<-max((1:length(hd))[hd>0])          # Find the largest histogram bar index k such that bar k is shorter than bar k+1
    p.brks<-c(p.brks[1:k],p.brks[(k+2):length(p.brks)])  # combine histogram bars k and k+1
    b.edf<-c(b.edf[1:k],b.edf[(k+2):length(b.edf)])      # update the EDF break-points with the monotonicity constraint
    hh<-exp(log(diff(b.edf))-log(diff(p.brks)))          # compute the new histogram heights
    hd<-diff(hh)                          # compare consecutive histogram heights
  }
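  # The outputs below all derive from the final histogram: each bar height
  # estimates the p-value density over its interval, the smallest height is
  # taken as the estimate of pi0 (the null proportion), the EBP at p is
  # pi0.hat divided by the interpolated density at p, and the FDR estimate is
  # the plug-in quantity pi0.hat*p/CDF(p). Note that the FDR estimate is not
  # truncated at 1 here; callers may wish to apply pmin(h.fdr,1).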
  ###### Compute the output quantities ######
  h.cdf<-approx(p.brks,b.edf,xout=p)$y             # Get the histogram-based CDF estimate for each p-value
  p.hist<-exp(log(diff(b.edf))-log(diff(p.brks)))  # Get the height of each histogram bar
  pi0.hat<-min(p.hist)                             # Get the pi0 estimate from the histogram
  h.ebp<-approx(p.brks,pi0.hat/c(p.hist,p.hist[length(p.hist)]),xout=p)$y  # Get the conservative EBP interpolation
  h.fdr<-exp(log(pi0.hat)+log(p)-log(h.cdf))       # Get the histogram-based FDR estimate
  h.ebp[p==0]<-0                                   # a p-value of exactly 0 is treated as certain evidence against the null
  h.fdr[p==0]<-0

  ### Output ###
  return(list(p=p,                # the input p-value vector
              edf=p.edf,          # the p-value EDF
              h.cdf=h.cdf,        # the histogram-based CDF estimate
              h.fdr=h.fdr,        # the histogram-based FDR estimate
              h.ebp=h.ebp,        # the histogram-based EBP estimate
              p.brks=p.brks,      # the p-value break-points of the histogram
              p.hist=p.hist,      # the height of each histogram bar
              edf.brks=b.edf,     # the break-points in the EDF of the histogram estimator
              pi0.hat=pi0.hat))   # the histogram-based estimate of the proportion of tests with a true null hypothesis
}
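####################################
# Example: a minimal sketch of pval.hist on simulated p-values, not run when
# this file is sourced. The mixture below (80% uniform nulls, 20% beta
# alternatives concentrated near 0) is an illustrative assumption; pi0.hat
# should land close to 0.8.
####################################

if (FALSE)
{
  set.seed(456)
  p.sim<-c(runif(800),rbeta(200,shape1=0.2,shape2=1))  # null and alternative p-values
  fit<-pval.hist(p.sim)
  fit$pi0.hat                                          # estimated null proportion
  plot(fit$p.brks,fit$edf.brks,type="l",               # fitted histogram CDF over the EDF
       xlab="p-value",ylab="estimated CDF")
  points(sort(p.sim),sort(fit$edf),pch=".")
}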