.packageName <- "gpls"
"h" <-
  function(x,family="normal",link="identity") { 
# g-inverse, i.e. inverse funtion of link function
    if (family=="normal"){
      if (link=="identity") x
      else cat("link function not recognized for ",family, " choose from identity!\n")
    }
    else if (family=="binomial") {
      if (link=="logit")
        {
          ifelse(is.infinite(exp(x)),1,exp(x)/(1+exp(x)))
        
        }
      else if (link=="probit") pnorm(x)
      else if (link=="cloglog") 1-exp(-exp(x))
      else cat("link function not recognized for ",family, " choose from logit,probit and cloglog!\n")
    }
    else if (family=="poisson") {
      if (link=="log") exp(x)
      else cat("link function not recognized for ", family, " chosse from log!\n" )
    }
    else if (family=="gamma"){
      if (link=="reciprocal") 1/x
      else cat("link function not recognized for ",family, " choose from reciprocal!\n")
    }
  }


"hp" <-
function(x,family="normal",link="identity") { 
# first derivative of h wrt eta, the linear predictor

if (family=="normal"){
if (link=="identity") rep(1,length(x))
else cat("link function not recognized for ",family, " choose from identity!\n")}
else if (family=="binomial") {
if (link=="logit") exp(x)/(1+exp(x))^2
else if (link=="probit") dnorm(x)
else if (link=="cloglog") exp(-exp(x))*exp(x)
else cat("link function not recognized for ",family, " choose from logit,probit and cloglog!\n")
}
else if (family=="poisson") {
if (link=="log") exp(x)
else cat("link function not recognized for ", family, " chosse from log!\n" )
}
}

"g" <-
function(x,family="normal",link="identity") {
# mu-link function for generalized linear model
if (family=="normal"){
if (link=="identity") x
else cat("link function not recognized for ",family, " choose from identity!\n")}
if (family=="binomial") {
 if (link=="logit") log(x/(1-x))
 else if (link=="probit") qnorm(x)
 else if (link=="cloglog") log(-log(1-x))
 else cat("link function not recognized for ",family, " choose from logit,probitand cloglog!\n")
}
else if (family=="poisson") {
 if (link=="log") log(x)
 else cat("link function not recognized for ", family, " choose from log!\n" )
}
}

"bpp" <-
function(x,family="normal",link="identity") { 
# variance function
if (family=="normal"){
if (link=="identity") rep(1,length(x))
else cat("link function not recognized for ",family, " choose from identity!\n")}
else if (family=="binomial") {
if (link=="logit") exp(x)/(1+exp(x))^2
else if (link=="probit") pnorm(x)*(1-pnorm(x))
else if (link=="cloglog") exp(-exp(x))*(1-exp(-exp(x)))
else cat("link function not recognized for ",family, " choose from logit,probit and cloglog!\n")
}
else if (family=="poisson") {
if (link=="log") exp(x)
else cat("link function not recognized for ", family, " chosse from log!\n" )
}
}

"psi" <-
function(x,family="normal") {
# initial value for dependent variable
if (family=="binomial") return((x+0.5)/2)
else if (family=="poisson") return(log(x+0.5))
else return(x)}

"probcal" <- function(y,eta)
{ # return P(Y=y), now only works for logit link for binomial family
  p <- 0
  for ( i in 1:length(y))
  p[i] <- ifelse(y[i]==1, exp(eta[i])/(1+exp(eta[i])) ,1/(1+exp(eta[i])))
  p
}

"glpls1a" <-
function(X,y,K.prov=NULL,eps=1e-3,lmax=100,b.ini=NULL,denom.eps=1e-20,family="binomial",link=NULL,br=T)
{

  if (is.null(link)) {
    if (family=="normal") link <- "identity"
    else if (family=="binomial") link <- "logit"
    else if (family=="poisson") link <- "log"
  }
  
  X <- as.matrix(X)
  dx <- dim(X)

### number of PLS components
  
  if (is.null(K.prov)) K <- min(dx[1]-1,dx[2])
  else if ( K.prov > min(dx[1]-1,dx[2]) )
    {
      cat("number of dimension K.provd exceeds rank of X. \n","Provide a number that is less than or equal to ",min(dx[1]-1,dx[2]),"!\n");
      K <- min(dx[1]-1,dx[2])
    }
  else
    {
      K <- K.prov
    }

  ## cat("Number of components is:", K,"!\n")

### initialzing matrices for PLS regression

  ## weight matrix for PLS regression
  W<- matrix(0,dx[2],K)
  
  ## score matrix
  Ta <- matrix(0,dx[1],K)

  ## loading matrix for X
  P <- matrix(0,dx[2],K)

  ## loading matrix for y
  Q <- numeric(K)
  
### number of iterations
  l <- 0
  
### intial values

  ## convergence
  converged <- F
  
  ## initial predictor matrix
  E <- X

  ## pseudo response
  ystar <- psi(y,family)
  ystar0 <- ystar 

  ## Weight matrix for GLM, e.g. p(1-p) for logistic regression
  V <- diag(c(hp(ystar,family,link)^2/bpp(ystar,family,link)))

  ## weighted standardization of predictor matrix
  E <- t(E)-apply(E,2,weighted.mean,diag(V))
  E <- t(E)
#  E <- t(E/sqrt(apply(E,1,var)))

  ## linear predictor, ie. eta = Xb
  eta <- rep(0,length(y))
  eta.old <- eta+10

  Q <- rep(0,K)
  Q.old <- rep(100,K)
  K.old <- K
  min <- 1000
  
  ## regression coefficients
  if(!is.null(b.ini))
    {
      beta.old <- b.ini[-1]
    }
  else{
    beta <- rep(1,ncol(X))
  }
  
  beta.old <- beta/1000 

  while ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps) & (l < lmax))
    {
  #    cat("Iter:",l,"\n")

      if ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) < min))
        {
          if(l==1)
            {
              W.min <- W
              P.min <- P
             Q.min <- Q
             }
          else
            {
              ## record minimum values
              min <- max(abs(beta-beta.old)/(abs(beta.old)+denom.eps))
              W.min <- W
              P.min <- P
              Q.min <- Q
              eta.min <- eta
              l.min <- l
            }
        }
      
      K.old <- K
     
      l <- l + 1
        
   #   K <- min(dx[1]-1,dx[2],Rank(E),K.prov)
      W <- matrix(0,dx[2],K)
      Ta <- matrix(0,dx[1],K)
      P <- matrix(0,dx[2],K)
      Q <- numeric(K)
      
### PLS regression
      
      for ( i in 1:K) {
        w <- t(E)%*%V%*%ystar
        w <- w/sqrt(crossprod(w)[1])
        W[,i] <- w
        ta <- E%*%w
        ## ta <- ta-weighted.mean(ta,diag(V))
        ## ta <- ta/sqrt(var(ta))[1]
        Ta[,i] <- ta
        taa <- (t(ta)%*%V%*%ta)[1]
        p <- t(E)%*%V%*%ta/taa
        P[,i] <- p
        q <- (t(ystar)%*%V%*%ta/taa)[1]
        Q[i] <- q
        ystar <- ystar - q*ta
        E <- E - ta%*%t(p)
      }

      ## update beta
      beta.old <- beta  
      beta <- W%*%solve(t(P)%*%W)%*%Q

      ## Hat matrix
      H <- hat(sweep(cbind(rep(1,dx[1]),X),1,sqrt(diag(V)),"*"),intercept=FALSE)
      ## update eta (linear predictor)
      eta <- weighted.mean(ystar0,diag(V)) + Ta%*%Q

      ## update and rescaling weight matrix
      V <- diag(c(hp(eta,family,link)^2/bpp(eta,family,link)))
      V <- V*(H*br+1)
      
      ## diagnosis for divergence
      if( sum(diag(V)=="NaN") > 0) 
        { 
          #print("diagonal elements of V overflow!") 
           break
        }
      if (sum(round(probcal(y,eta),4)>=0.9999)==length(y)) 
        { 
           #print("complete separation !")
           break 
        }

      ## update pseudo response
     ystar <- eta + diag(c(1/hp(eta,family,link)))%*%(y+H*br/2-(H*br+1)*h(eta,family,link))/(H*br +1)
      ystar0 <- ystar

      ## update predictor matrix
      E <- t(X)-apply(X,2,weighted.mean,diag(V))
      E <- t(E)
      ## E <- t(E/sqrt(apply(E,1,var)))
           
#         print(max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)))
    }

#  print(l)

  if (max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps)
    {
 #     cat("Convergence Not achieved and estimate from iteration ", l.min, " is used!")
        W <- W.min
        P <- P.min
        Q <- Q.min
        eta <- eta.min
    }
  else
    {
      converged <- T
    }

  ## final estimates
  beta <- W%*%solve(t(P)%*%W)%*%Q
  beta0 <- eta[1]-X[1,]%*%beta
  
  list(coefficients=c(beta0,beta),
                convergence = converged,
                niter = l,
                bias.reduction = br)
}
glpls1a.train.test.error <- function(train.X,train.y,test.X,test.y,K.prov=NULL,eps=1e-3,lmax=100,family="binomial",link="logit",br=TRUE)
{
  ## Fit on the training set and report the out-of-sample error rate on
  ## the test set using a 0.5 probability cutoff.
  ## The input predictor matrices are p by n and are transposed internally.
  ## NOTE(review): after the transposes, the row matching and the
  ## rowMeans/sd standardization operate on the transposed matrices'
  ## rows -- confirm the intended feature/sample orientation with callers.
  ## Returns list(error, error.obs, predict.test).

  train.X <- t(train.X)
  test.X <- t(test.X)

  ## align test rows with training rows when both are named
  ## (&& replaces & -- this is a scalar condition)
  if(!is.null(rownames(train.X)) && !is.null(rownames(test.X)))
     {
        test.X <- test.X[match(row.names(train.X),row.names(test.X)),]
     }

  ## standardize the test set with the training location/scale
  train.mean <- rowMeans(train.X)
  train.sd <- apply(train.X,1,sd)
  train.X <- scale(t(train.X))

  test.X <- t((test.X-train.mean)/train.sd)

  train.fit <- glpls1a(train.X,train.y,K.prov=K.prov,eps=eps,lmax=lmax,family=family,link=link,br=br)
  test.y.predict <- glpls1a.predict(test.X,train.fit$coefficients,family=family,link=link)
  ## parenthesized for clarity (R chains comparison operators left to
  ## right, so the original unparenthesized form parsed the same way)
  error.test <- 1-sum((test.y.predict>0.5)==test.y)/length(test.y)
  error.obs <- seq_len(length(test.y))[(test.y.predict>0.5)!=test.y]
  return(list(error=error.test, error.obs = error.obs,predict.test=test.y.predict))
}

glpls1a.cv.error <- function(train.X,train.y,K.prov=NULL,eps=1e-3,lmax=100,family="binomial",link="logit",br=TRUE)
  {
    ## Leave-one-out cross-validated error rate on the training set
    ## using a 0.5 probability cutoff.
    ## NOTE(review): the original comment says train.X is p by n, but
    ## observations are removed by row (train.X[-i,]) -- it is used as
    ## n by p here; confirm orientation against callers.
    ## Returns list(error, error.obs).

    n <- nrow(train.X)
    train.y.predict <- numeric(n)   # preallocate instead of growing
    for( i in seq_len(n))
      {
        train.fit <- glpls1a(train.X[-i,],train.y[-i],K.prov=K.prov,eps=eps,lmax=lmax,family=family,link=link,br=br)
        ## bug fix: forward family/link to the prediction step (the
        ## original silently fell back to the binomial/logit defaults)
        train.y.predict[i] <- glpls1a.predict(matrix(train.X[i,],nrow=1),train.fit$coefficients,family=family,link=link)
      }

    error.CV <- 1-sum((train.y.predict>0.5)==train.y)/length(train.y)
    error.obs <- seq_len(length(train.y))[(train.y.predict>0.5)!=train.y]
    return(list(error=error.CV,error.obs=error.obs))
  }

glpls1a.error <- function(train.X,train.y,K.prov=NULL,eps=1e-3,lmax=100,family="binomial",link="logit",br=TRUE)
  {
    ## Apparent (resubstitution) error rate on the training set with a
    ## 0.5 probability cutoff.  (The original header comment claimed
    ## leave-one-out CV; no observation is held out here.)
    ## train.X is used as n by p (observations in rows).
    ## Returns list(error, error.obs).

    train.fit <- glpls1a(train.X,train.y,K.prov=K.prov,eps=eps,lmax=lmax,family=family,link=link,br=br)
    ## bug fix: forward family/link to the prediction step (the original
    ## silently used the binomial/logit defaults)
    train.y.predict <- glpls1a.predict(train.X,train.fit$coefficients,family=family,link=link)
    error <- 1-sum((train.y.predict>0.5)==train.y)/length(train.y)
    error.obs <- seq_len(length(train.y))[(train.y.predict>0.5)!=train.y]
    return(list(error=error,error.obs=error.obs))
  }


glpls1a.mlogit.cv.error <- function(train.X,train.y,K.prov=NULL,eps=1e-3,lmax=100,mlogit=T,br=T)
  {
    ## Leave-one-out cross-validated error rate for a multinomial logit model.
    ## train.y is the multinomial outcome coded 1..C, with level 1 (the
    ## reference category) being the most frequent.
    ## mlogit = TRUE fits the joint multinomial model (glpls1a.mlogit);
    ## otherwise separate one-vs-reference fits (glpls1a.logit.all).
    ## Returns list(error, error.obs).
    ## NOTE(review): the original comment says train.X is p by n, but the
    ## code deletes observations by row (train.X[-i,]) -- it is used as
    ## n by p here; confirm orientation against callers.

    family <- "binomial"
    link <- "logit"

    n <- nrow(train.X)

    C <- max(train.y)
    ## one column per non-reference category (2..C)
    train.y.predict <- matrix(0,nrow=n,ncol=C-1)

    ## default component count
    ## NOTE(review): glpls1a caps K at min(dx[1]-1, dx[2]); the swapped
    ## form min(dx[1], dx[2]-1) here looks inconsistent -- verify intended.
    if(is.null(K.prov))
      {
        dx <- dim(train.X)
        K.prov <- min(dx[1],dx[2]-1)
      }


    for( i in 1:n)
      {
    #   print(i)
       if(mlogit)
         {
           train.fit <- glpls1a.mlogit(train.X[-i,],train.y[-i],K.prov=K.prov,eps=eps,lmax=lmax,br=br)
         }
       else
         {
            train.fit <- glpls1a.logit.all(train.X[-i,],train.y[-i],K.prov=K.prov,eps=eps,lmax=lmax,br=br)
          }

       ## predict the C-1 non-reference class probabilities for the held-out row
       train.y.predict[i,] <- glpls1a.mlogit.predict(matrix(train.X[i,],nrow=1),train.fit$coef)
      }

    ## mark which non-reference class (if any) attains the maximum
    ## probability, where 1 - sum(x) is the reference-class probability
    temp <- t(apply(train.y.predict,1,function(x) x==max(x,1-sum(x))))
    ## prepend an indicator for "reference class 1 wins" (no non-reference
    ## class attained the max), then convert each row's logical mask into
    ## the winning class label 1..C
    temp <- apply(cbind(apply(temp,1,function(x) !any(x)),temp),1,function(x) (1:C)[x])
    error.CV <- 1-sum(temp==train.y)/length(train.y)
    error.obs <- seq(1:length(train.y))[temp != train.y]
    return(list(error=error.CV,error.obs=error.obs))
  }
glpls1a.logit.all <- function(X,y,K.prov=NULL,eps=1e-3,lmax=100,b.ini=NULL,denom.eps=1e-20,family="binomial",link="logit",br=TRUE)
{
  ## Fit a multinomial model as C separate one-vs-reference logistic PLS
  ## fits (category 1 is the reference).
  ## X: n x p predictor matrix; y: outcome coded 1..C+1.
  ## Returns list(coefficients = (p+1) x C matrix), one column per
  ## non-reference category (intercept in row 1).
  family <-"binomial"
  link <- "logit"

  x <- as.matrix(X)
  yv <- as.numeric(as.vector(y))
  dimnames(x) <- names(yv) <- NULL
  C <- max(yv)-1

  n <- nrow(x)
  r <- ncol(x)+1    # +1 for the intercept returned by glpls1a

  ## indicator matrix of outcomes; column 1 is the reference category
  y <- matrix(0,n,C+1)
  y[cbind(seq(n),yv)] <- 1
  y0 <- y[,1]
  y <- y[,-1,drop=FALSE]

  beta <- matrix(0,r,C)

  for ( i in seq_len(C))
    {
      ## keep only observations in the reference class or class i+1
      index <- (1:n)[y0 != y[,i]]
      ## bug fix: drop=FALSE keeps X a matrix when a single row is selected
      X <- x[index,,drop=FALSE]
      Y <- as.vector(y[index,i])
      beta[,i] <- glpls1a(X,Y,K.prov=K.prov,eps=eps,lmax=lmax,b.ini=b.ini,denom.eps=denom.eps,family=family,link=link,br=br)$coef
    }
  return(list(coefficients=beta))
}
glpls1a.mlogit <- function(x,y,K.prov=NULL,eps=1e-3,lmax=100,b.ini=NULL,denom.eps=1e-20,family="binomial",link="logit",br=TRUE)
{
  ## Multinomial (baseline-category) logit model fitted by iteratively
  ## reweighted partial least squares, with optional bias reduction.
  ## x: n x p predictor matrix (an intercept column is prepended if absent);
  ## y: outcome coded 1..C+1, category 1 being the reference.
  ## Returns list(coefficients = r x C matrix, convergence, niter,
  ## bias.reduction).
  ## NOTE(review): the GLM weight matrix V is initialized with random
  ## values (abs(rnorm)), so results are not deterministic across runs
  ## unless the RNG seed is fixed.

  ## bug fix: `break` is invalid outside a loop in R; use stop() to abort
  if(family != "binomial" || link != "logit")
    {
      stop("wrong family (link) !\n")
    }

  ## prepend an intercept column if x does not already carry one
  if(any(x[,1] !=1))
    x <- cbind(rep(1,nrow(x)),x)

  x <- as.matrix(x)
  yv <- as.numeric(as.vector(y))
  dimnames(x) <- names(yv) <- NULL
  r <- ncol(x)
  C <- max(yv)-1

  n <- nrow(x)
  ## indicator matrix of outcomes; drop the reference-category column
  y <- matrix(0,n,C+1)
  y[cbind(seq(n),yv)] <- 1
  y0 <- y[,1]
  y <- y[,-1]
  jC <- rep(1,C)
  ## stacked response: the C indicators of each observation, in order
  y2 <- as.vector(t(y))
  ## (dead locals jn and the unused x2 construction removed)

  ## expanded block design: row (i-1)*C+j carries x[i,] in the
  ## coefficient block of category j
  X <- matrix(0,nrow=C*n,ncol=C*r)
  for ( i in 1:n)
    {
      for (j in 1:C)
        {
          X[((i-1)*C+j),(((j-1)*r+1):((j-1)*r+r))] <- x[i,]
        }
    }

  dx <- dim(X)

  if (is.null(K.prov)) K <- min(dx[1]-1,dx[2])
  else if ( K.prov > min(dx[1]-1,dx[2]) )
    {
      cat("number of dimension K.provd exceeds rank of X. \n","Provide a number that is less than or equal to ",min(dx[1]-1,dx[2]),"!\n")
      ## bug fix: `break` is invalid outside a loop; stop() aborts properly
      stop("K.prov exceeds the rank of X")
    }
  else
    {
      K <- K.prov
    }

### initializing matrices for PLS regression

  ## weight matrix for PLS regression
  W<- matrix(0,dx[2],K)

  ## score matrix
  Ta <- matrix(0,dx[1],K)

  ## loading matrix for X
  P <- matrix(0,dx[2],K)

  ## loading vector for y
  Q <- numeric(K)

### initial values

  ## convergence flag
  converged <- FALSE

  ## initial predictor matrix
  E <- X

  ## pseudo response
  ystar <- psi(y2,family)
  ystar0 <- ystar 

  ## GLM weight matrix (random positive start; see NOTE above)
  V <- diag(abs(rnorm(C*n)),C*n)

  ## linear predictor, i.e. eta = Xb
  eta.old <- rep(1,length(y2))
  eta <- eta.old+100

  Q <- rep(0,K)
  Q.old <- rep(100,K)
  K.old <- K
  min <- 1000   # best (smallest) relative coefficient change so far

  ## starting regression coefficients
  if(!is.null(b.ini))
    {
      beta <- b.ini
    }
  else{
    beta <- rep(1,C*r)
  }

  beta.old <- beta/1000

  ## iteration index
  l <- 0
  l.min <- 1

  while ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps) & (l < lmax))
    {
      ## keep the best iterate so far as a fallback on non-convergence
      if ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) < min))
      {
          if(l==1)
            {
              W.min <- W
              P.min <- P
              Q.min <- Q
            }
          else
            {
              ## record minimum values
              min <- max(abs(beta-beta.old)/(abs(beta.old)+denom.eps))
              W.min <- W
              P.min <- P
              Q.min <- Q
              eta.min <- eta
              l.min <- l
            }
        }

      K.old <- K

      l <- l + 1

      W <- matrix(0,dx[2],K)
      Ta <- matrix(0,dx[1],K)
      P <- matrix(0,dx[2],K)
      Q <- numeric(K)

      ### PLS regression on the current pseudo response with weights V
      for ( i in 1:K) {
        w <- t(E)%*%V%*%ystar
        w <- w/sqrt(crossprod(w)[1])
        W[,i] <- w
        ta <- E%*%w
        Ta[,i] <- ta
        taa <- (t(ta)%*%V%*%ta)[1]
        p <- t(E)%*%V%*%ta/taa
        P[,i] <- p
        q <- (t(ystar)%*%V%*%ta/taa)[1]
        Q[i] <- q
        ## deflate response and predictors
        ystar <- ystar - q*ta
        E <- E - ta%*%t(p)
      }

      ## update beta; the generalized inverse guards against a singular
      ## t(P)%*%W (ginv returns NA in that case)
      temp <- ginv(t(P)%*%W)
      if(any(is.na(temp)))
        {
          ## t(P)%*%W singular
          break
        }
      else
        {
          beta.old <- beta
          beta <- W%*%temp%*%Q
        }

      ## hat matrix of the weighted expanded design
      H <- V%*%X%*%ginv(t(X)%*%V%*%X)%*%t(X)

      ## update eta (linear predictor) and the category probabilities
      eta <- as.vector(t(x%*%matrix(beta,ncol=C,byrow=FALSE)))
      p <- exp(x%*%matrix(beta,ncol=C ,byrow=FALSE))

      den.p <- 1+as.vector(p%*%jC)

      if(any(is.infinite(den.p)))
        {
          ## infinite denominator for p: diverged
          break
        }
      p <- p2 <- p/den.p
      dim(p2) <- c(n,C)
      p2 <- as.vector(t(p2))

      ## update the block-diagonal multinomial weight matrix:
      ## block j is diag(p_j) - p_j p_j'
      V <- matrix(0, C*n, C*n)
      for(j in seq(n)) {
        V[((j - 1) * C + 1):(j * C), ((j - 1) * C + 1):(j * C)] <- diag(p[j,],length(p[j,]))-matrix(p[j,],nrow=C,ncol=1)%*%matrix(p[j,],nrow=1,ncol=C)
      }

      ## d_eta/d_mu matrix, filled blockwise over the upper triangle and
      ## mirrored (note: local `sum` shadows base::sum inside the loop;
      ## function lookup still resolves sum() calls correctly)
      detadmu <- V
      for ( j in seq(n))
        {
          for (i in ((j - 1) * C + 1):(j * C))
            for (k in (i:(j*C)))
              {
                sum <- sum(p2[((j-1)*C+1):(j*C)])
                if(i==k)
                  {
                    detadmu[i,k] <- (1-sum+p2[k])/(p2[k]*(1-sum))
                  }
                else
                  {
                    detadmu[i,k] <- 1/(1-sum)
                    detadmu[k,i] <- detadmu[i,k]
                  }
              }
        }

      ## per-entry leverage term used by the bias-reduction correction
      Hw <- NULL
      for ( j in seq(n))
        {
          sum <- sum(diag(H)[((j - 1) * C + 1):(j * C)])
          for (i in ((j - 1) * C + 1):(j * C)) 
            {
              Hw[i] <- sum + diag(H)[i]
            }
        }

      ## update pseudo response
      if(any(is.na(detadmu)))
        {
          ## V singular
          break
        }

      ystar <- eta + detadmu%*%(y2+diag(H)*br/2-(Hw*br/2+1)*p2)/(Hw*br/2+1)
      ystar0 <- ystar
      V <- V*(Hw*br/2+1)

      ## reset the predictor matrix for the next PLS pass
      E <- X
    }

  if (max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps)
    {
      ## convergence not achieved: fall back to the best recorded iterate
      W <- W.min
      P <- P.min
      Q <- Q.min
    }
  else
    {
      converged <- TRUE
    }
  ## final estimates, reshaped to one column per non-reference category
  beta <- W%*%ginv(t(P)%*%W)%*%Q

return( list(coefficients=matrix(beta,ncol=C ,byrow=FALSE),     
              convergence = converged,
              niter = l,bias.reduction =br))

}
glpls1a.predict <- function(X,beta,family="binomial",link="logit")
{
  ## Predicted mean response for new data: prepend an intercept column
  ## unless X already carries one, then apply the inverse link h().
  has.intercept <- all(X[,1] == rep(1,nrow(X)))
  design <- if (has.intercept) X else cbind(rep(1,nrow(X)),X)
  h(design %*% beta, family, link)
}

glpls1a.mlogit.predict <- function(X,beta)
  {
    ## Predicted class probabilities (levels 2..J) for a multinomial
    ## logit model; beta is the p by J-1 coefficient matrix, J being the
    ## number of outcome categories.  An intercept column is prepended
    ## unless X already starts with one.
    design <- if (all(X[,1] == rep(1,nrow(X)))) X
              else cbind(rep(1,nrow(X)),X)
    eta <- design %*% beta
    num <- exp(eta)
    denom <- apply(num, 1, function(z) 1 + sum(z))
    num/denom
  }
ginv <- function (X, tol = sqrt(.Machine$double.eps))
{
    ## Moore-Penrose generalized inverse via SVD (adapted from MASS::ginv),
    ## modified to return NA when X contains NA or when the singular
    ## values cannot be compared against the tolerance.
    if (length(dim(X)) > 2 || !(is.numeric(X) || is.complex(X)))
        stop("X must be a numeric or complex matrix")
    if (!is.matrix(X))
        X <- as.matrix(X)
    if (any(is.na(X)))
        return(NA)
    decomp <- svd(X)
    if (is.complex(X))
        decomp$u <- Conj(decomp$u)
    ## keep only singular values above the relative tolerance
    keep <- decomp$d > max(tol * decomp$d[1], 0)
    if (any(is.na(keep)))
        return(NA)
    if (all(keep)) {
        decomp$v %*% (1/decomp$d * t(decomp$u))
    } else if (!any(keep)) {
        ## rank zero: the pseudo-inverse is the zero matrix, transposed dims
        array(0, dim(X)[2:1])
    } else {
        decomp$v[, keep, drop = FALSE] %*%
            ((1/decomp$d[keep]) * t(decomp$u[, keep, drop = FALSE]))
    }
}

