################################################
# This R code defines two functions.
# hybrid.poisson.test: perform the analysis method of Pounds, Gao, and Zhang (2012, Stat Apps in Genetics and Molecular Biology)
# pval.hist: fit the adaptive histogram estimator to the p-values as described by Pounds, Gao, and Zhang
# Notes: This library borrows code from Rebecca Doerge (www.stat.purdue.edu/~doerge/software/TSPM.R)
################################################

####################################
# Function: hybrid.poisson.test
# Purpose: Use the score test for overdispersion to choose a standard Poisson likelihood
#          ratio test or a quasi-likelihood test for each gene
# Arguments: see the comments in the function signature below
# Returns: a data.frame, see the return line below
####################################

hybrid.poisson.test <- function(counts,                   # matrix of mRNA-seq counts, rows for genes and columns for subjects
                                x1,                       # vector of group labels, must have the same number of entries as counts has columns
                                x0=rep(1,length(x1)),     # vector of factors for the "null" model, defaults to all 1's for a standard two-group comparison
                                lib.size=colSums(counts), # vector of normalizing constants, defaults to the total number of counts for each subject
                                use.offset=TRUE)          # logical: if TRUE, include log(lib.size) as an offset in each GLM
{
  #### Initialize per-gene result vectors ####
  n <- dim(counts)[1]       # number of genes
  n.samp <- dim(counts)[2]  # number of subjects
  per.gene.disp <- NULL     # Pearson overdispersion estimates
  LRT <- NULL               # likelihood ratio statistics
  score.test <- NULL        # overdispersion score statistics
  LFC <- NULL               # estimated log fold-changes

  ###### The main loop that fits the GLMs to each gene ######
  for(i in 1:n){
    ### Fit full and reduced models ###
    if (use.offset)
    {
      model.1 <- glm(as.numeric(counts[i,]) ~ x1, offset=log(lib.size), family=poisson)
      model.0 <- glm(as.numeric(counts[i,]) ~ x0, offset=log(lib.size), family=poisson)
    }
    else
    {
      model.1 <- glm(as.numeric(counts[i,]) ~ x1, family=poisson)
      model.0 <- glm(as.numeric(counts[i,]) ~ x0, family=poisson)
    }

    ### Obtain diagonals of the hat matrix from the full model fit ###
    hats <- hatvalues(model.1)

    ### Obtain the Pearson overdispersion estimate ###
    per.gene.disp[i] <- sum(residuals(model.1, type="pearson")^2)/model.1$df.residual

    ### Obtain the likelihood ratio statistic ###
    LRT[i] <- deviance(model.0)-deviance(model.1)

    ### Obtain the score test statistic for overdispersion (same form as in TSPM.R) ###
    score.test[i] <- 1/(2*length(counts[i,])) *
      sum(residuals(model.1, type="pearson")^2 -
          ((counts[i,] - hats*model.1$fitted.values)/model.1$fitted.values))^2

    ### Obtain the estimated log fold-change (negative of the fitted group coefficient) ###
    LFC[i] <- -model.1$coef[2]
  }

  ###### Compute p-values ######
  p.dispersion <- pchisq(score.test, df=1, lower.tail=FALSE)  # test for overdispersion
  p.f <- pf(LRT/per.gene.disp, df1=1, df2=model.1$df.residual, lower.tail=FALSE)  # test for differential expression assuming overdispersion
                                                              # (the residual df is the same for every gene, so the last fitted model supplies it)
  p.chi <- pchisq(LRT, df=1, lower.tail=FALSE)                # test for differential expression assuming the standard Poisson model

  ###### Compute EBPs ######
  ebp.st.poisson <- pval.hist(p.dispersion)$h.ebp  # EBP that the standard Poisson model is correct
  ebp.null.od <- pval.hist(p.f)$h.ebp              # EBP of no differential expression assuming the overdispersed Poisson model
  ebp.null.st <- pval.hist(p.chi)$h.ebp            # EBP of no differential expression assuming the standard Poisson model
  ebp.null.wgt <- ebp.null.st*ebp.st.poisson+ebp.null.od*(1-ebp.st.poisson)

  ###### Pick the better test for each gene ######
  p.best <- p.chi
  p.best[ebp.st.poisson<0.5] <- p.f[ebp.st.poisson<0.5]
  ebp.null.best <- pval.hist(p.best)$h.ebp

  ### Output ###
  return(cbind.data.frame(log.fold.change=LFC,            # log fold-change
                          lrt.stat=LRT,                   # likelihood ratio statistic from the standard Poisson model
                          od.hat=per.gene.disp,           # dispersion parameter estimate
                          F.stat=LRT/per.gene.disp,       # F-statistic from the quasi-likelihood test
                          p.dispersion=p.dispersion,      # p-value for the test of overdispersion
                          p.od.model=p.f,                 # p-value for differential expression assuming overdispersion
                          p.st.model=p.chi,               # p-value for differential expression assuming the standard Poisson model
                          p.best=p.best,                  # p-value from using the EBP that the standard Poisson model is correct to pick the best test for each gene
                          ebp.st.poisson=ebp.st.poisson,  # EBP that the standard Poisson model is correct
                          ebp.null.st=ebp.null.st,        # EBP of no differential expression assuming the standard Poisson model is correct
                          ebp.null.od=ebp.null.od,        # EBP of no differential expression assuming the overdispersed model
                          ebp.null.wgt=ebp.null.wgt,      # weighted average EBP of no differential expression from the standard and overdispersed Poisson models
                          ebp.null.best=ebp.null.best))   # EBP of no differential expression using p.best
}
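####################################
# Example: a minimal usage sketch, not run when this file is sourced.
# The simulated data are illustrative assumptions (100 genes, two groups of
# 4 subjects, negative binomial counts so that some genes are overdispersed
# relative to the Poisson model); they are not part of the published method.
####################################

if (FALSE)
{
  set.seed(123)
  n.genes <- 100
  grp <- rep(0:1, each=4)                                  # two-group labels for x1
  mu <- matrix(rep(rexp(n.genes, rate=0.01), 8),           # per-gene mean counts,
               n.genes, 8)                                 # constant across subjects
  sim.counts <- matrix(rnbinom(n.genes*8, mu=mu, size=5),  # overdispersed counts
                       n.genes, 8)
  res <- hybrid.poisson.test(sim.counts, x1=grp)           # offset and library sizes by default
  head(res[, c("log.fold.change","p.best","ebp.null.best")])
}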
####################################
# Function: pval.hist
# Purpose: Use the adaptive histogram estimator to compute the EBP, FDR, etc.
# Arguments: p - a vector of p-values
# Returns: a list as documented in the return line below
####################################

pval.hist<-function(p)
{
  m<-length(p)                            # count the p-values
  h.cdf<-h.pdf<-p.edf<-rep(NA,length(p))  # initialize the h.cdf, h.pdf, and p.edf vectors
  ord<-order(p)                           # order the p-values
  p.edf[ord]<-(1:m)/(m+1)                 # define the EDF estimator

  ###### Build the histogram break-points by scanning the EDF from right to left ######
  hh<-NULL                                # initialize the histogram heights
  F0<-p0<-b.edf<-p.brks<-1                # initialize F0, p0, the EDF break-points, and the p-value break-points
  while(any(p<p0))                        # add break-points until no p-value lies left of the anchor (p0,F0)
  {
    slp<-(F0-p.edf)/(p0-p)                # slope of the chord from each EDF point to the anchor
    slp.min<-min(slp[p<p0])               # the flattest chord stays above all EDF points to its left
    ok<-(p<p0)&(slp<=slp.min)             # find the p-values that are "OK" (those the flattest chord touches)
    if (any(ok))                          # Insert a new histogram break-point
    {
      p.cut<-min(p[ok])                   # Find the minimum p-value that is OK
      p.brks<-c(p.cut,p.brks)             # Update the p-value break-point vector
      hh<-c(slp.min,hh)                   # Update the histogram heights vector
      p0<-p.cut                           # Update p0
      F0<-min(p.edf[p>=p0])               # Update F0
      b.edf<-c(F0,b.edf)                  # Update the break-points in the EDF histogram estimator
    }
    else                                  # defensive: no point qualifies, so insert a (p,EDF) point at (0,0)
    {
      p.brks<-c(0,min(p),p.brks)          # close the histogram through the smallest p-value
      b.edf<-c(0,min(p.edf),b.edf)        # and anchor the EDF estimator at the origin
      p0<-0                               # setting p0 to 0 exits the loop
    }
  }
  if (p.brks[1]>0)                        # ensure the histogram extends to (0,0)
  {
    p.brks<-c(0,p.brks)
    b.edf<-c(0,b.edf)
  }

  ###### Remove zero-width bars and compute the histogram heights ######
  zero.diffs<-which(diff(p.brks)==0)      # find duplicated break-points
  if (length(zero.diffs)>0)
  {
    p.brks<-p.brks[-(zero.diffs+1)]
    b.edf<-b.edf[-(zero.diffs+1)]
  }
  hh<-exp(log(diff(b.edf))-log(diff(p.brks)))  # bar height = rise in the EDF over the width of the bar
  if (any(is.na(hh)))
  {
    print(summary(p))
    print(p.brks)
    print(b.edf)
    print(hh)
    stop("undefined histogram heights.")
  }

  ###### Enforce monotonicity of the histogram heights ######
  hd<-diff(hh)                            # Difference in histogram heights
  while(any(hd>0))                        # Enforce monotonicity
  {
    k<-max((1:length(hd))[hd>0])          # Find the largest histogram bar index k such that bar k is shorter than bar k+1
    p.brks<-c(p.brks[1:k],p.brks[(k+2):length(p.brks)])  # combine histogram bars k and k+1
    b.edf<-c(b.edf[1:k],b.edf[(k+2):length(b.edf)])      # update the EDF break-points with the monotonicity constraint
    hh<-exp(log(diff(b.edf))-log(diff(p.brks)))          # compute the new histogram heights
    hd<-diff(hh)                          # compare consecutive histogram heights
  }
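  # The outputs below all derive from the final histogram: each bar height
  # estimates the p-value density over its interval, the smallest height is
  # taken as the estimate of pi0 (the null proportion), the EBP at p is
  # pi0.hat divided by the interpolated density at p, and the FDR estimate is
  # the plug-in quantity pi0.hat*p/CDF(p). Note that the FDR estimate is not
  # truncated at 1 here; callers may wish to apply pmin(h.fdr,1).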
  ###### Compute the output quantities ######
  h.cdf<-approx(p.brks,b.edf,xout=p)$y             # Get the histogram-based CDF estimate for each p-value
  p.hist<-exp(log(diff(b.edf))-log(diff(p.brks)))  # Get the height of each histogram bar
  pi0.hat<-min(p.hist)                             # Get the pi0 estimate from the histogram
  h.ebp<-approx(p.brks,pi0.hat/c(p.hist,p.hist[length(p.hist)]),xout=p)$y  # Get the conservative EBP interpolation
  h.fdr<-exp(log(pi0.hat)+log(p)-log(h.cdf))       # Get the histogram-based FDR estimate
  h.ebp[p==0]<-0                                   # a p-value of exactly 0 is treated as certain evidence against the null
  h.fdr[p==0]<-0

  ### Output ###
  return(list(p=p,                # the input p-value vector
              edf=p.edf,          # the p-value EDF
              h.cdf=h.cdf,        # the histogram-based CDF estimate
              h.fdr=h.fdr,        # the histogram-based FDR estimate
              h.ebp=h.ebp,        # the histogram-based EBP estimate
              p.brks=p.brks,      # the p-value break-points of the histogram
              p.hist=p.hist,      # the height of each histogram bar
              edf.brks=b.edf,     # the break-points in the EDF of the histogram estimator
              pi0.hat=pi0.hat))   # the histogram-based estimate of the proportion of tests with a true null hypothesis
}
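####################################
# Example: a minimal sketch of pval.hist on simulated p-values, not run when
# this file is sourced. The mixture below (80% uniform nulls, 20% beta
# alternatives concentrated near 0) is an illustrative assumption; pi0.hat
# should land close to 0.8.
####################################

if (FALSE)
{
  set.seed(456)
  p.sim<-c(runif(800),rbeta(200,shape1=0.2,shape2=1))  # null and alternative p-values
  fit<-pval.hist(p.sim)
  fit$pi0.hat                                          # estimated null proportion
  plot(fit$p.brks,fit$edf.brks,type="l",               # fitted histogram CDF over the EDF
       xlab="p-value",ylab="estimated CDF")
  points(sort(p.sim),sort(fit$edf),pch=".")
}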