.packageName <- "GSEAlm"
#### This file actually has the lmpergene function
### also:

### pvals from permutation matrix and qRequire (copied over
### from Category package where they are hidden functions)

### GSEA permutation test for one main effect in a multiple regression setting
### (generalization of gseattperm)



#in this function we  compute the linear regressions
#on a pergene basis
#note the NA handling is still not great, as we don't yet
# deal with NA's in the ExpressionSet

lmPerGene <- function(eSet,formula="", na.rm = TRUE) {

  if (formula=="") {

### Intercept-Only Model (default)
      nSamp = ncol(eSet)
      x=matrix(rep(1,nSamp),nrow=nSamp,ncol=1)
      colnames(x) = "Intercept"
      eS = eSet


  } else {
    xvarnames = all.vars(formula)
  badvars = !(xvarnames %in% varLabels(eSet))
  if( any(badvars) )
      stop("variable", xvarnames[badvars],
          "are not variable names in the supplied ExpressionSet")

  nvar <- length(xvarnames)


##drop any with missing values in some covariate
  if( na.rm ) {
      na = rep(FALSE, ncol(eSet))
      for (i in xvarnames)
         na = na | is.na(eSet[[i]])
      eS = eSet[, !na]
  } else
      eS = eSet

    nSamp=ncol(eS)
  xvar <- pData(eS)[, xvarnames, drop=FALSE]

    x = model.matrix(formula, data = xvar)
}
  xx <- crossprod(x)
  xxinv <- solve(xx)
  Hmat <- x %*% xxinv %*% t(x)
  dMat = diag(nSamp) - Hmat
  k = ncol(x)

  xy = exprs(eS) %*% x
  res = exprs(eS) %*% dMat
  beta = solve(xx, t(xy))
  varr = rowSums( exprs(eS) * res)/(nSamp - k)

  varbeta = matrix(diag(xxinv), ncol=k, nrow=nrow(eS),
     byrow=TRUE)
  varbeta = apply(varbeta, 2, function(x) x* varr)
  colnames(varbeta) = colnames(x)

  return(list(eS=eS, x = x, Hmat = Hmat, formula=formula,coefficients = beta,
              sigmaSqr = var, coef.var = t(varbeta)))
}

##### GSEA inference for main effect using multiple regression and permutation
##### Inference reported only for main effect, which MUST BE
##### THE FIRST VARIABLE IN THE FORMULA
##### This is because label permutations are done within each block defined by
##### level combinations of the other variables

##### This is an extension of "gseattperm"

gsealmPerm=function (eSet,formula="",mat,nperm,na.rm=TRUE,...) {

### For the most part we rely on 'lmPerGene' for formula validation, NA removal, etc. etc.

nSamp=ncol(eSet)

if (formula=="") {
        nvar=0
} else {
    xvarnames = all.vars(formula)
    nvar <- length(xvarnames)
}

### The observed t-values for the main effect

obsRaw=lmPerGene(eSet=eSet,formula=formula,na.rm=na.rm)

if (nvar>0) {
    observedStats= GSNormalize(obsRaw$coefficients[2,]/sqrt(obsRaw$coef.var[2,]),incidence=mat,...)
} else {
    observedStats= GSNormalize(t(obsRaw$coefficients),incidence=mat,fun2=identity,...)
}

### Permutation loop; we do the intercept-only case separately below

perm.eset=eSet

i <- 1L
if (nvar>0) {
    permMat <- matrix(0, nrow = nrow(eSet), ncol = nperm)

    while (i < (nperm + 1)) {

#### The crux (with nvar>=2): label permutation is done *within each covariate-combination level group separately*

        if (nvar>=2) {
            splitter=pData(eSet)[,xvarnames[2]]
            if (nvar>2) splitter=as.list(pData(eSet)[,xvarnames[2:nvar]])

            label.perm=unsplit(lapply(split(1:nSamp,splitter),sample),splitter)

### Now we only permute the labels of variable 1
            pData(perm.eset)[,xvarnames[1]]<-pData(eSet)[label.perm,xvarnames[1]]
        } else if (nvar==1) {
            pData(perm.eset)[,xvarnames[1]]<-pData(eSet)[sample(1:nSamp),xvarnames[1]]
        }
        temp.results<-lmPerGene(eSet=perm.eset,formula=formula,na.rm=na.rm)
# (na.rm=FALSE since we already dealt with na's)

### record t-score for permuted variable
        permMat[, i] <- temp.results$coefficients[2,]/sqrt(temp.results$coef.var[2,])

        i <- i + 1L
  }
  permMat <- GSNormalize(permMat,incidence=mat,...)
  rownames(permMat)=rownames(mat)

} else if (nvar==0) {

### Intercept only - row permutation and using raw expression means, no need for repeated calls to 'lmPerGene'

    permMat <- matrix(0, nrow = nrow(mat), ncol = nperm)
    rownames(permMat)=rownames(mat)

    for (i in 1:nperm)  permMat[,i]=GSNormalize(t(obsRaw$coefficients),incidence=mat[,sample(1:ncol(mat))],fun2=identity,...)
}
  return(pvalFromPermMat(observedStats, permMat))
}


#### unexported functions

pvalFromPermMat <- function(obs, perms) {
    N <- ncol(perms)
    pvals <- matrix(as.double(NA), nr=nrow(perms), ncol=2)
    dimnames(pvals) <- list(rownames(perms), c("Lower", "Upper"))

    tempObs <- rep(obs, ncol(perms))
    dim(tempObs) <- dim(perms)
    pvals[ , 1] <- rowSums(perms <= tempObs)/N
    pvals[ , 2] <- rowSums(perms >= tempObs)/N
    pvals
}

### plotting functions for GSEA diagnostics





####################3
# plotting functions: no error check here
####################3

### Gene-set indiviudal residual Boxplots by sample, by factor level -
### for 2-level factors 
### Each point is a single residual from a single gene and sample

resplot=function(GSname="All", resmat,incidence=dumminc(resmat), fac,
atomic="Gene",core.text="Residuals by Sample", 
yname="Standardized Residual", xname="Sample ID", ID=colnames(resmat),
lims=0,gnames=levels(fac), prefix="", horiz=FALSE,
colour=5,pch='+',...)  {

setsize=sum(incidence[GSname,]>0)
atomics=paste(atomic,"s",sep="")
k=nlevels(fac)

lengths=table(fac)
tnames=levels(fac)

layout(1:k)
if (horiz) layout(t(1:k))

if (length(lims)==1) lims=range(resmat[incidence[GSname,]>0,])
lass=ifelse(horiz,1,3)
if(horiz) {
            timp=xname
            xname=yname
            yname=timp }

for (a in 1:k) {
    title1=paste(prefix,GSname,gnames[a],core.text, "(",setsize,atomics,")")
    boxplot(split(resmat[incidence[GSname,]>0,fac==tnames[a]],col(resmat[incidence[GSname,]>0,fac==tnames[a]])),main=title1,cex=.7,cex.axis=.8,names=ID[fac==tnames[a]],las=lass,pch=pch,range=1,ylim=lims,xlab=xname,ylab=yname,boxwex=0.8*min(1,log(lengths[a])/log(mean(lengths))),horizontal=horiz,col=colour,...)
    if (horiz) { lines(rep(0,2),c(-5,length(ID)),col=2)
        } else lines(c(-5,length(ID)),rep(0,2),col=2)
}
}

### Same, but stripcharts for smaller-size gene sets

restrip=function(GSname="All", resmat, incidence=dumminc(resmat), fac,
atomic="Gene", core.text="Residuals by Sample",
yname="Standardized Residual", xname="Sample ID", ID=colnames(resmat),
gnames=levels(fac), prefix="", colour=c(2:4,6), resort=TRUE,
horiz=FALSE, resort.fun=num.positive, pch='+',...) {

vert=!horiz    
setsize=sum(incidence[GSname,]>0)
myraw=resmat[incidence[GSname,]>0,]
atomics=paste(atomic,"s",sep="")


if (resort) {
   colsort=sort(apply(myraw,2,resort.fun),index=T)
   myraw=myraw[,colsort$ix]
   fac=fac[colsort$ix]
   ID=ID[colsort$ix]
}

k=nlevels(fac)
lengths=table(fac)
tnames=levels(fac)

layout(1:k)
if (!vert) layout(t(1:k))

#if (lims==0) lims=range(resmat[incidence[GSname,]>0,])
lass=ifelse(!vert,1,3)
if(!vert) {
            timp=xname
            xname=yname
            yname=timp }

for (a in 1:k) {
    title1=paste(prefix,GSname,gnames[a],core.text, "(",setsize,atomics,")")

    stripchart(split(myraw[,fac==tnames[a]],col(myraw[,fac==tnames[a]])),cex=.8,cex.axis=.8,group.names=ID[fac==tnames[a]],xlab=xname,ylab=yname,las=lass,pch=pch,vert=vert,col=colour,...)
    title(title1)
    if (vert) { lines(c(-5,length(ID)),rep(0,2))
     } else lines(rep(0,2),c(-5,length(ID)))
}

}

############# An upgrade (or downgrade?) to the MNplot

mnDiffPlot=function(GSname="All", exprmat, incidence=dumminc(exprmat),
fac, atomic="Gene", core.text=paste("Mean Expression Difference by",
                    atomic),
yname="Log Expression Ratio", xname="Log Expression",
gnames=levels(fac), prefix="", fitline=FALSE, varsize=FALSE,
reverse=FALSE, ...) {

layout(1)
lengths=table(fac)
tnames=levels(fac)
setsize=sum(incidence[GSname,]>0)
atomics=paste(atomic,"s",sep="")

if (reverse) {
    lengths=rev(lengths)
    gnames=rev(gnames)
    tnames=rev(tnames)
}
title0=paste(prefix,GSname,core.text, "(",setsize,atomics,")")
title2=paste(yname,"(",gnames[2],"/",gnames[1],")")

xvals=apply(exprmat[incidence[GSname,]>0,fac==tnames[1]],1,mean)
yvals=apply(exprmat[incidence[GSname,]>0,fac==tnames[2]],1,mean)-xvals

if (varsize) {
    xerrs=apply(exprmat[incidence[GSname,]>0,fac==tnames[1]],1,var)/lengths[1]
    yerrs=apply(exprmat[incidence[GSname,]>0,fac==tnames[2]],1,var)/lengths[2]+xerrs
}
sizes=1
if(varsize) sizes=sqrt(yerrs/mean(yerrs))
ylims=range(yvals,na.rm=TRUE)
if(ylims[1]>0) ylims[1]=0
if(ylims[2]<0) ylims[2]=0

plot(xvals,yvals,main=title0,xlab=paste (gnames[1],":",xname),ylab=title2,cex=sizes,ylim=ylims,...)
abline(0,0,col=2)
if (fitline) lines(sort(xvals),predict(loess(yvals~xvals),newdata=sort(xvals)),col=4)
}


####################### unexported utilities

### dummy incidence matrix for getting all resids

dumminc=function(resmat) {

myout=matrix(1,nrow=1,ncol=nrow(resmat))
rownames(myout)="All"
myout}

num.positive=function(x) sum(x>0)
num.extreme=function(x,lo,hi) sum(x<lo | x>hi)
num.low=function(x,lo) sum(x<lo)
num.high=function(x,hi) sum(x>hi)

### This file has utilities related to incidence matrices


### This function does the normalized aggregation over gene-sets;
### It is now hopefully generic enough to enable any GSEA flavor

GSNormalize<-function(dataset,incidence,gseaFun=crossprod,fun1="/",fun2=sqrt,removeShift=FALSE,removeStat=mean,...) {

    dataset=as.matrix(dataset)

### Removal of column-wise mean shift

    if (removeShift) {

        colStats=apply(dataset,2,removeStat)
        dataset=sweep(dataset,2,STATS=colStats)
    }

    if (ncol(incidence) != nrow(dataset)) stop ("GSNormalize: non-conforming matrices")


    outm=gseaFun(t(incidence),dataset,...)


    rownames(outm)=rownames(incidence)
    colnames(outm)=colnames(dataset)

    normby = fun2(rowSums(incidence))

    outm = sweep (outm,1,normby,FUN=fun1)

}



identity<-function(x) x
one <- function(x) 1
#### This file has mostly diagnostic functions

### get residuals
### Cook's D
### DFBETAS, DFFITS
### Leverage (hat-mat diagonal)

### All have been vector-matrixized now for quick implementation

dffitsPerGene <- function(lmobj) {
  obj <- lmobj
  n <- ncol(exprs(obj$eS))
  p <- ncol(obj$x)
  e <- tcrossprod(diag(n) - obj$Hmat,exprs(obj$eS))
  SSE <- sapply(featureNames(obj$eS), function(i) { exprs(obj$eS)[i,] %*% e[,i] } )
  hii <- diag(obj$Hmat)

  se <- sapply(colnames(e),function(i) {
    sqrt((SSE[i] * (1-hii)- e[,i] * e[,i])/ (n-p-1))
  } )
  t <- e/se
  dffits <- apply(t,2,function(u) {
    u * sqrt(hii / (1-hii))
  } )
  dffits=t(dffits)

  return(dffits)
}


CooksDPerGene <- function(lmobj) {
     rs = getResidPerGene(lmobj,type="intStudent")
### residuals have to be internally Studentized for the shortcut formula to be correct
     p = ncol(lmobj$x)

     D = exprs(rs)^2
     sweep(D, 2, diag(lmobj$Hmat)/(p*(1-diag(lmobj$Hmat))), "*")
 }


dfbetasPerGene<- function(lmobj) {
### Shortcut formula from Jensen and Ramirez
  obj <- lmobj
  xx <- crossprod(obj$x)
  xxinv <- solve(xx)
  n <- ncol(exprs(obj$eS))
  p <- ncol(obj$x)

  Db <- array(0,dim=c(nrow(exprs(obj$eS)),n,p),dimnames=list(featureNames(obj$eS),sampleNames(obj$eS),rownames(obj$coefficients)))

  tees=exprs(getResidPerGene(lmobj))
  oneMinusH=1-Leverage(lmobj)
  xxx=tcrossprod(xxinv,obj$x)

  for (k in 1:p) {

      Db[,,k] = t(t(tees)*xxx[k,]/sqrt(xxinv[k,k]*oneMinusH))
  }

Db
}

getResidPerGene <-  function(lmobj, type="extStudent") {
### Residuals from a per-gene linear model
### Uses matrix algebra to make a faster calculation than doing them one by one
### Accepted types: response, normalized, internally Studentized (a.k.a. standardized) and externally Studentized (default)

    nSamp = ncol(lmobj$eS)
  dMat = diag(nSamp) - lmobj$Hmat
  e = crossprod(t(exprs(lmobj$eS)),dMat)


  p <- ncol(lmobj$x)

### The code (perhaps primitively) proceeds by successive adjustments needed to change the residual type, with a possible exit at each step:

if (type=="response")   return(new("ExpressionSet",exprs=e,phenoData=phenoData(lmobj$eS)))

### The following is a simple normalization by gene-specific res.std.err.:

  sigma = sqrt(rowSums(e^2)/(nSamp-p))

  e=e/sigma

if (type=="normalized")   return(new("ExpressionSet",exprs=e,phenoData=phenoData(lmobj$eS)))

###  Internal Studentization
stud.e = sweep(e, 2, sqrt(1-diag(lmobj$Hmat)), "/")

if (type=="intStudent" || type=="standardized")   return(new("ExpressionSet",exprs=stud.e,phenoData=phenoData(lmobj$eS)))

#### External Studentization (the default)

stud.e=stud.e*sqrt((nSamp-p-1)/(nSamp-p-stud.e^2))

  return(new("ExpressionSet",exprs=stud.e,phenoData=phenoData(lmobj$eS)))
}

Leverage <- function(lmobj) {
  Hmat <- lmobj$Hmat
  ans <- diag(Hmat)
  return(ans)
}
