########################################################################################################
# Filtering Library
# This code implements the pooled p-value and error-minimizing pooled p-value filters as described by
# Pounds and Cheng (2005) Statistical Development and Evaluation of Gene Expression Data Filters. Journal of Computational Biology.
# 
# Three functions are defined in this code:
# probe.filter - apply the pooled p-value or error-minimizing pooled p-value filter to a set
#                of Affymetrix present-absent p-values
# pooled.p     - compute pooled p-values given the present-absent p-values
# splosh0      - called by probe.filter, applies the SPLOSH algorithm as part of the EMPP filter
#######################################################################################################

#################################################################################################################
# Function: probe.filter
# Purpose: Apply the pooled p-value or EMPP filter as described by Pounds and Cheng (2005 J. Comp. Biology)
#
# Arguments: P - an (n x g) matrix giving the Affy present-absent p-values with
#                rows representing n chips and columns representing g probe sets
#            grp - a vector of length n giving the experimental group memberships of the n chips
#            alpha - the cut-off for pooled p-values, default = "MTE" for the error-minimizing pooled p-value filter
#            opts - list of options for the SPLOSH algorithm, defaults provided, see comments on splosh0 function below
#
# Returns: a list with the following components
#          include - a boolean that indicates whether the corresponding probe should be included in further analysis
#          uniq.grps - names of the experimental groups
#          alpha - vector or scalar giving the inclusion-threshold for the pooled p-value for each of the experimental groups
#          pooled.p - matrix of pooled p-values for each probe (columns) and each experimental group (rows)
#
######################################################################################################################

probe.filter<-function(P,grp,alpha="MTE",delta=0.01,opts=list(LC=loess.control(),dplaces=4))

{
	n <- dim(P)[1]
	g <- dim(P)[2]
	grp<-as.character(grp)
	ug<-unique(grp)
	ng<-length(ug)
	L<-matrix(0,ng,n)
	for (i in 1:ng)
		L[i,]<-(grp==ug[i])
	m<-L%*%rep(1,n)
	p<-matrix(1-pgamma(-L%*%log(P),m,1),ng,g)
	if (alpha=="MTE")
	{
		alpha<-rep(NA,ng)
		for (i in 1:ng)
		{
			sub<-((p[i,]>delta)+(p[i,]<(1-delta)))==2
			res<-splosh0(p[i,sub],opts$LC,opts$dplaces)
			alpha[i]<-max(p[i,sub][res$toterr==min(res$toterr)])
		}
	}
	inc0<-(p<=alpha)
	include<-(t(rep(1,ng))%*%(p<=alpha))>0
	return(list(include=as.vector(include),uniq.grps=ug,alpha=alpha,pooled.p=p))
}

##########################################################################
# Function: pooled.p
# Purpose: Compute pooled p-values
# Arguments: P - a matrix giving the Affy present-absent p-values with
#                rows representing n chips and columns representing g probe sets
# Returns: p - a vector of pooled p-values for each of the g probe sets
##########################################################################

pooled.p<-function(P)

{
	n <- dim(P)[1]
	g <- dim(P)[2]
	ones <- rep(1,n)
	y <- as.vector(-ones%*%log(P))
	p <- 1-pgamma(y,n,1)
	return(p)
}

########################################################################################################
# Function name: splosh0
# Arguments: pvals - a vector of p-values obtained by applying a statistical hypothesis
#                    test to the expression values for each gene individually
#            LC - (optional) a list containing tuning parameters for the loess algorithm, 
#                 see help(loess.control) for more details, defaults to S-plus defaults
#            dplaces - (optional) round p-values to this many decimal places to avoid
#                      numerical difficulties that can arise by division by small numbers, default = 6
#            notes - (optional) may pass any string describing notes regarding the analysis, default = NULL
#
# Ouputs: a list with the following elements
#         cfdr - a vector of the cfdr estimates at the values specified by the user in pvals
#         q - a vector of corresponding monotone fdr estimates, similar to Storey's q-value
#         cdf - vector of the SPLOSH cdf estimate evaluated at pvals
#         fp - vector of estimated no. of false positives incurred by setting threshold 
#              at corresponding value of pvals
#         fn - vector of estimated no. of false negatives incurred by setting threshold 
#              at corresponding value of pvals
#         toterr - vector giving the sum of fp and fn for each entry
#         ebp - the empirical Bayes posterior that the null hypothesis is true, see Efron et al. (JASA 2001)
#         pi - estimate of the mixing parameter giving the probability of the null hypothesis
#         ord - a vector giving index numbers to order results according to ascending p-values
#         dplaces - echoes value of dplaces used in algorithm
#         LC - echoes value of LC used in algorithm
#         fdr.method - gives bibliographic reference for the method
#         notes - returns notes regarding the analysis
###########################################################################################################

splosh0<-function(pvals,LC=loess.control(),dplaces=4,notes=NULL)
     
{
	notes<-c(notes,"SPLOSH used to estimate FDR.")
	x<-round(pvals,dplaces)                                      # Prevent numerical difficulties
	n<-length(pvals)                                             # Obtain no. of points
	r<-rank(x)                                                   # rank observations
	ux<-sort(unique(x))                                          # ordered unique p-values
	ur<-(sort(unique(r)))-.5                                     # ordered unique ranks
	if (max(ux)<1)                                               # edge adjustments
	{
		ux<-c(ux,1)
		ur<-c(ur,n)
	}
	if (min(ux)>0)
	{
		ux<-c(0,ux)
		ur<-c(0,ur)
	}  
	nux<-length(ux)                                              # No. of unique points
	ur<-ur/n                                                     # Divide ranks by n
	dx<-diff(ux)                                                 # spacings of unique p-values
	dr<-diff(ur)                                                 # spacings of unique ranks
	dFdx<-exp(log(dr)-log(dx))                                   # spacing-interval slopes
	mr<-(ur[1:(nux-1)]+ur[2:nux])/(2)                            # Unitized Medians of Unique Rank Intervals
	mx<-(ux[1:(nux-1)]+ux[2:nux])/2                              # Unitized Medians of Unique p-value Intervals
	tr<-asin(2*(mr-.5))                                          # Arc-Sine Transform Rank Midpoints
	fit<-loess(log(dFdx)~tr,loess.control=LC)                    # Lowess of log-transformed EDQF and transformed rank-midpoints
	tr2<-asin(2*(ur-.5))
	yhat<-(predict.loess(fit,tr2))                               # Obtain Estimated Derivative of Quantile Function
	yhat[c(1,nux)]<-approx(tr2[2:(nux-1)],
	                       yhat[2:(nux-1)],
	                       xout=tr2[c(1,nux)],
	                       rule=3)$y                             # Extrapolate to get endpoint estimates
	pdf1<-exp(yhat)                                              # Obtain PDF up to a constant
	trap<-0.5*(pdf1[1:(nux-1)]+pdf1[2:nux])*dx                   # Trapezoid rule terms
	const<-sum(trap)                                             # Constant via trapezoid rule
	pdf2<-pdf1/const                                             # Adjust pdf by constant
	pdf<-approx(ux,pdf2,xout=pvals,rule=3)$y                     # Obtain pdf estimate for p-values
	cdf<-approx(ux[2:nux],                                       
	            cumsum(trap),
	            xout=pvals,
	            rule=2)$y/const                                  # Obtain CDF 
	pi<-min(pdf)                                                 # Take pi as min of PDF
	p.pi<-max(pvals[pdf==pi])                                    # p-value at which pi occurs
	cfdr<-pi*pvals/cdf                                           # Estimate cFDR for p-values
	ebp<-pi/pdf                                                  # Estimate EBP for p-values
	cfdr[pvals==0]<-ebp[pvals==0]                                # L'Hospital's Rule
	cfdr[cfdr>1]<-1                                              # Set max(fdr)=1
	qval<-cfdr                                                   # Initialize q-value vector
	ordering<-order(pvals)                                       # Report Results
	qval[rev(ordering)]<-cummin(qval[rev(ordering)])             # Compute q-value
	fp<-n*cfdr*cdf
	fn<-n*(1-cdf-pi*(1-pvals))
	fn[pvals>=p.pi]<-0
	toterr<-fp+fn
	return(list(cfdr=matrix(cfdr,n,1),                 # cfdr estimate
	            q=matrix(qval,n,1),                    # q-value estimate
	            cdf=matrix(cdf,n,1),                   # cdf estimate
	            fp=matrix(fp,n,1),                     # estimate of no. of false postives
	            fn=matrix(fn,n,1),                     # estimate of no. of false negatives
	            toterr=matrix(toterr,n,1),             # estimated total no. of errors
	            pdf=matrix(pdf,n,1),                   # estimate of pdf
	            ebp=matrix(ebp,n,1),                   # estimate of ebp
	            pi=pi,                                 # estimate of null proportion
	            p.pi=p.pi,                             # p-value at which estimated pdf achieves minimum
	            ord=ordering,                          # ordering vector
	            dplaces=dplaces,                       # rounded to this no. of decimal places
	            LC=LC,                                 # loess.control options used
	            fdr.method="Pounds and Cheng (2004)",  # Bibliographic Information
	            notes=notes))                          # Notes regarding analysis
}