.packageName <- "gpls"  # legacy package-name registration for the gpls package
"h" <-
  function(x,family="normal",link="identity") { 
# g-inverse, i.e. inverse funtion of link function
    if (family=="normal"){
      if (link=="identity") x
      else cat("link function not recognized for ",family, " choose from identity!\n")
    }
    else if (family=="binomial") {
      if (link=="logit")
        {
          ifelse(is.infinite(exp(x)),1,exp(x)/(1+exp(x)))
        
        }
      else if (link=="probit") pnorm(x)
      else if (link=="cloglog") 1-exp(-exp(x))
      else cat("link function not recognized for ",family, " choose from logit,probit and cloglog!\n")
    }
    else if (family=="poisson") {
      if (link=="log") exp(x)
      else cat("link function not recognized for ", family, " chosse from log!\n" )
    }
    else if (family=="gamma"){
      if (link=="reciprocal") 1/x
      else cat("link function not recognized for ",family, " choose from reciprocal!\n")
    }
  }


"hp" <-
function(x,family="normal",link="identity") { 
# first derivative of h wrt eta, the linear predictor

if (family=="normal"){
if (link=="identity") rep(1,length(x))
else cat("link function not recognized for ",family, " choose from identity!\n")}
else if (family=="binomial") {
if (link=="logit") exp(x)/(1+exp(x))^2
else if (link=="probit") dnorm(x)
else if (link=="cloglog") exp(-exp(x))*exp(x)
else cat("link function not recognized for ",family, " choose from logit,probit and cloglog!\n")
}
else if (family=="poisson") {
if (link=="log") exp(x)
else cat("link function not recognized for ", family, " chosse from log!\n" )
}
}

"g" <-
function(x,family="normal",link="identity") {
# mu-link function for generalized linear model
if (family=="normal"){
if (link=="identity") x
else cat("link function not recognized for ",family, " choose from identity!\n")}
if (family=="binomial") {
 if (link=="logit") log(x/(1-x))
 else if (link=="probit") qnorm(x)
 else if (link=="cloglog") log(-log(1-x))
 else cat("link function not recognized for ",family, " choose from logit,probitand cloglog!\n")
}
else if (family=="poisson") {
 if (link=="log") log(x)
 else cat("link function not recognized for ", family, " choose from log!\n" )
}
}

"bpp" <-
function(x,family="normal",link="identity") { 
# variance function
if (family=="normal"){
if (link=="identity") rep(1,length(x))
else cat("link function not recognized for ",family, " choose from identity!\n")}
else if (family=="binomial") {
if (link=="logit") exp(x)/(1+exp(x))^2
else if (link=="probit") pnorm(x)*(1-pnorm(x))
else if (link=="cloglog") exp(-exp(x))*(1-exp(-exp(x)))
else cat("link function not recognized for ",family, " choose from logit,probit and cloglog!\n")
}
else if (family=="poisson") {
if (link=="log") exp(x)
else cat("link function not recognized for ", family, " chosse from log!\n" )
}
}

"psi" <-
function(x,family="normal") {
# initial value for dependent variable
if (family=="binomial") return((x+0.5)/2)
else if (family=="poisson") return(log(x+0.5))
else return(x)}

"probcal" <- function(y,eta)
{ # return P(Y=y), now only works for logit link for binomial family
  p <- 0
  for ( i in 1:length(y))
  p[i] <- ifelse(y[i]==1, exp(eta[i])/(1+exp(eta[i])) ,1/(1+exp(eta[i])))
  p
}

##Copyright 2004 B. Ding and R. Gentleman, all rights reserved

glpls1a <- function(X, y, K.prov=NULL, eps=1e-3, lmax=100, b.ini=NULL,
                    denom.eps=1e-20, family="binomial", link=NULL, br=TRUE)
{
  ## Fit a one-response generalized partial least squares model by
  ## iteratively reweighted PLS, optionally with Firth-type bias
  ## reduction (Ding & Gentleman, 2004).
  ##   X         : n x p predictor matrix (coerced with as.matrix)
  ##   y         : binary response (factor, or numeric in [0, 1])
  ##   K.prov    : number of PLS components; default min(n-1, p)
  ##   eps       : relative-change convergence tolerance for beta
  ##   lmax      : maximum number of IRLS iterations
  ##   b.ini     : optional starting coefficients (intercept first)
  ##   denom.eps : guard added to |beta.old| in the relative-change test
  ##   family    : "normal", "binomial" or "poisson"
  ##   link      : link function; defaults to the family's canonical link
  ##   br        : apply Firth bias reduction?
  ## Returns a list of class "gpls": coefficients (named, intercept
  ## first), convergence flag, iteration count, family, link, response
  ## levels and the bias-reduction flag.

  ## default to the canonical link of the chosen family
  if (is.null(link)) {
    if (family=="normal") link <- "identity"
    else if (family=="binomial") link <- "logit"
    else if (family=="poisson") link <- "log"
    else stop("unknown family", family)
  }

  X <- as.matrix(X)
  dx <- dim(X)

  ## fix up y: a factor becomes 0/1 (first level vs everything else)
  if (is.factor(y)) {
      levs = levels(y)
      if( length(levs) > 2 )
          levs = c(levs[1], "Other")
      y <- y != levels(y)[1]
  }
  else levs = unique(y)
  if(length(levs) > 2)
      warning("y has more than two levels")

  if (any(y < 0 | y > 1))
        stop("y values must be 0 <= y <= 1")

  ## number of PLS components, capped at the maximal possible rank of X
  if (is.null(K.prov)) K <- min(dx[1]-1,dx[2])
  else if ( K.prov > min(dx[1]-1,dx[2]) )
    {
      cat("number of dimension K.provd exceeds rank of X. \n","Provide a number that is less than or equal to ",min(dx[1]-1,dx[2]),"!\n");
      K <- min(dx[1]-1,dx[2])
    }
  else
    {
      K <- K.prov
    }

### initializing matrices for PLS regression

  W <- matrix(0,dx[2],K)    # weight matrix for PLS regression
  Ta <- matrix(0,dx[1],K)   # score matrix
  P <- matrix(0,dx[2],K)    # loading matrix for X
  Q <- numeric(K)           # loadings for y

  l <- 0                    # iteration counter
  converged <- FALSE

  ## working predictor matrix (centered below)
  E <- X

  ## pseudo response (working response of the IRLS scheme)
  ystar <- psi(y, family)
  ystar0 <- ystar

  ## GLM weight matrix, e.g. p(1-p) for logistic regression
  V <- diag(c(hp(ystar,family,link)^2/bpp(ystar,family,link)))

  ## weighted centering of the predictor matrix
  E <- t(E)-apply(E,2,weighted.mean,diag(V))
  E <- t(E)

  ## linear predictor, i.e. eta = X beta
  eta <- rep(0,length(y))
  eta.old <- eta+10

  Q <- rep(0,K)
  Q.old <- rep(100,K)
  K.old <- K
  min <- 1000   # smallest relative change seen so far

  ## initialize the "best iterate" records so the non-convergence
  ## fallback after the loop is always well defined, even if the loop
  ## exits before recording anything
  W.min <- W; P.min <- P; Q.min <- Q; eta.min <- eta; l.min <- 0

  ## starting coefficients: b.ini (dropping its intercept) or all ones.
  ## BUG FIX: the original assigned b.ini[-1] to beta.old, leaving beta
  ## undefined and crashing at `beta.old <- beta/1000`; it plainly meant
  ## to initialize beta.
  if(!is.null(b.ini))
    {
      beta <- b.ini[-1]
    }
  else{
    beta <- rep(1,ncol(X))
  }

  beta.old <- beta/1000

  while ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps) &
         (l < lmax))     {

      ## track the iterate with the smallest relative change so it can
      ## be reported if the loop stops without converging
      if ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) < min))
        {
          if(l==1)
            {
              W.min <- W
              P.min <- P
              Q.min <- Q
            }
          else
            {
              ## record minimum values
              min <- max(abs(beta-beta.old)/(abs(beta.old)+denom.eps))
              W.min <- W
              P.min <- P
              Q.min <- Q
              eta.min <- eta
              l.min <- l
            }
        }

      K.old <- K

      l <- l + 1

      W <- matrix(0,dx[2],K)
      Ta <- matrix(0,dx[1],K)
      P <- matrix(0,dx[2],K)
      Q <- numeric(K)

      ## PLS regression: extract K components from (E, ystar) under the
      ## weight matrix V, deflating after each component
      for ( i in 1:K) {
        w <- t(E)%*%V%*%ystar
        w <- w/sqrt(crossprod(w)[1])
        W[,i] <- w
        ta <- E%*%w
        Ta[,i] <- ta
        taa <- (t(ta)%*%V%*%ta)[1]
        p <- t(E)%*%V%*%ta/taa
        P[,i] <- p
        q <- (t(ystar)%*%V%*%ta/taa)[1]
        Q[i] <- q
        ystar <- ystar - q*ta    # deflate the response
        E <- E - ta%*%t(p)       # deflate the predictors
      }

      ## update beta
      beta.old <- beta
      beta <- W%*%solve(t(P)%*%W)%*%Q

      ## hat-matrix diagonals of the weighted design (bias reduction)
      H <- hat(sweep(cbind(rep(1,dx[1]),X),1,sqrt(diag(V)),"*"),intercept=FALSE)
      ## update eta (linear predictor)
      eta <- weighted.mean(ystar0,diag(V)) + Ta%*%Q

      ## update and rescale the weight matrix
      V <- diag(c(hp(eta,family,link)^2/bpp(eta,family,link)))
      V <- V*(H*br+1)

      ## diagnosis for divergence: weights overflowed ...
      if (any(is.nan(diag(V))))
        {
           break
        }
      ## ... or complete separation (all fitted P(Y=y) essentially 1)
      if (sum(round(probcal(y,eta),4)>=0.9999)==length(y))
        {
           break
        }

      ## update pseudo response
      ystar <- eta +
      diag(c(1/hp(eta,family,link)))%*%(y+H*br/2 -
      (H*br+1)*h(eta,family,link))/(H*br +1)
      ystar0 <- ystar

      ## re-center the predictor matrix with the new weights
      E <- t(X)-apply(X,2,weighted.mean,diag(V))
      E <- t(E)
    }

  if (max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps)
    {
        ## no convergence: fall back to the best recorded iterate
        W <- W.min
        P <- P.min
        Q <- Q.min
        eta <- eta.min
    }
  else
    {
      converged <- TRUE
    }

  ## final estimates; recover the intercept from observation 1
  beta <- W%*%solve(t(P)%*%W)%*%Q
  beta0 <- eta[1]-X[1,]%*%beta

  ## put some names onto the coefs
  coef = c(beta0,beta)
  dnx = dimnames(X)[[2]]
  if(length(dnx) != length(beta))
      dnx = paste("X", 1:length(beta), sep=":")
  names(coef) = c("Intercept", dnx)

  ans = list(coefficients=coef,
                convergence = converged,
                niter = l,
                family = family,
                link = link,
                levs = levs,
                bias.reduction = br)
  class(ans) = "gpls"
  ans
}

##let's see if we can put this into a standard modeling framework
## Generic entry point for generalized partial least squares models;
## dispatches on the class of `x` (see gpls.default and gpls.formula).
gpls <- function(x, ...) {
  UseMethod("gpls")
}

## Default gpls method: hand the design matrix and response straight to
## the glpls1a work-horse, forwarding every tuning parameter by name.
gpls.default <- function(x, y, K.prov = NULL, eps = 1e-3, lmax = 100,
                         b.ini = NULL, denom.eps = 1e-20,
                         family = "binomial", link = NULL, br = TRUE, ...) {
  glpls1a(x, y, K.prov = K.prov, eps = eps, lmax = lmax, b.ini = b.ini,
          denom.eps = denom.eps, family = family, link = link, br = br)
}

gpls.formula = function(formula, data, contrasts=NULL, K.prov=NULL,
eps=1e-3, lmax=100, b.ini=NULL, denom.eps=1e-20, family="binomial",
link=NULL, br=TRUE, ...) {
    ## Formula interface for gpls: build the model frame and model
    ## matrix, strip the intercept column (glpls1a estimates its own
    ## intercept), and delegate the fit to glpls1a.
    ## Standard model.frame() call-rewriting idiom: keep only the
    ## formula/data arguments and re-evaluate in the caller's frame.
    mf = match.call()
    m = match(c("formula", "data"), names(mf), 0)
    mf = mf[c(1,m)]
    mf[[1]] = as.name("model.frame")
    mf = eval(mf, parent.frame())
    mt = attr(mf, "terms")
    y = model.response(mf, "numeric")
    x = if( !is.empty.model(mt))
           model.matrix(mt, mf, contrasts)
        else matrix(, NROW(y), 0)
    ## drop the intercept column if model.matrix added one
    xint = match("(Intercept)", colnames(x), nomatch=0 )
    if(xint > 0 )
        x <- x[, -xint, drop=FALSE]
    ans = glpls1a(x, y, K.prov, eps, lmax, b.ini,
            denom.eps, family, link, br)
    ## keep the terms and call so predict.gpls / print.gpls can use them
    ans$terms = mt
    ans$call = match.call()
    ans
}

## Print method for "gpls" fits: show the originating call and, when
## present, the fitted coefficients; returns the object invisibly so it
## can be used in pipelines.
print.gpls <- function(x, digits = max(3, getOption("digits") - 3), ...)
{
    cat("\nCall:\n", deparse(x$call), "\n\n", sep = "")
    cf <- coef(x)
    if (length(cf)) {
        cat("Coefficients:\n")
        print.default(format(cf, digits = digits),
                      print.gap = 2, quote = FALSE)
    }
    invisible(x)
}

##Based on predict.lda
##Based on predict.lda
## Predict method for "gpls" fits: rebuilds the design matrix for
## newdata (formula fits) or takes newdata as a matrix (default fits),
## applies the inverse link h() to the linear predictor, and classifies
## at the 0.5 probability cutoff.
## Returns list(class = factor of predicted labels, predicted = probabilities).
predict.gpls = function (object, newdata, ...)
{
    if (!inherits(object, "gpls"))
        stop("object not of class gpls")
    if (!is.null(Terms <- object$terms)) {
        ## formula interface: reconstruct the model matrix from terms
        Terms <- delete.response(Terms)
        if (missing(newdata))
            newdata <- model.frame(object)
        else {
            newdata <- model.frame(Terms, newdata, na.action = na.pass,
                xlev = object$xlevels)
            if (!is.null(cl <- attr(Terms, "dataClasses"))) 
                stats::.checkMFClasses(cl, newdata)
        }
        x <- model.matrix(Terms, newdata, contrasts = object$contrasts)
        ## drop the intercept column; an explicit one is prepended below
        xint <- match("(Intercept)", colnames(x), nomatch = 0)
        if (xint > 0)
            x <- x[, -xint, drop = FALSE]
    }
    else {
        ## matrix interface: if newdata is missing, recover the training
        ## data from the stored call (honoring any subset/na.action)
        if (missing(newdata)) {
            if (!is.null(sub <- object$call$subset))
                newdata <- eval.parent(parse(text =
                paste(deparse(object$call$x, backtick = TRUE), "[",
                deparse(sub, backtick = TRUE), ",]")))
            else newdata <- eval.parent(object$call$x)
            if (!is.null(nas <- object$call$na.action))
                newdata <- eval(call(nas, newdata))
        }
        ## a bare vector is treated as a single observation (row)
        if (is.null(dim(newdata)))
            dim(newdata) <- c(1, length(newdata))
        x <- as.matrix(newdata)
    }
    ## coefficients hold the intercept first, hence the "- 1"
    if (ncol(x) != length(object$coefficients) - 1 )
        stop("wrong number of variables")
    if (length(colnames(x)) > 0 && any(colnames(x) !=
                names(object$coef[-1])) )
        warning("Variable names in newdata do not match those in object")
##FIXME: just copied this from Beiying's stuff - clearly needs a bit
## more thought - why do we strip off the coef when fitting, and then
## add it back in?
    ## NOTE(review): $coefficient works only via partial matching of
    ## $coefficients — fragile; confirm before renaming list elements.
    eta <- cbind(rep(1,nrow(x)),x)%*%object$coefficient
    preds = h(eta, object$family, object$link)
    ## we seem to need to do a lot of futzing to get the right levels:
    ## when all predictions fall on one side of 0.5, factor() would
    ## otherwise produce a single-level factor, so both levels are
    ## restored explicitly.
    rf = preds > 0.5
    if(all(rf) ) {
        rf = factor(rf, labels=object$levs[2])
        levels(rf) = object$levs
    }
    else if( !any(rf) ) {
        rf = factor(rf, labels=object$levs[1])
        levels(rf) = object$levs
    }
    else
        rf = factor(preds[,1]>0.5, labels=object$levs)
    ans = list(class = rf, predicted = preds)
    ans
}
glpls1a.train.test.error <- function(train.X, train.y, test.X, test.y,
                                     K.prov=NULL, eps=1e-3, lmax=100,
                                     family="binomial", link="logit", br=TRUE)
{
  ## Test-set error of a gpls fit by out-of-sample counting.
  ## The input predictor matrices are p x n (variables x samples); both
  ## are transposed here and the test set is standardized with the
  ## TRAINING means/sds.  Returns the error rate, the indices of the
  ## misclassified test samples, and the predicted test probabilities.

  train.X <- t(train.X)
  test.X <- t(test.X)

  ## align test variables with the training variables by name
  ## (scalar condition, so && rather than elementwise &)
  if (!is.null(rownames(train.X)) && !is.null(rownames(test.X)))
     {
        test.X <- test.X[match(row.names(train.X),row.names(test.X)),]
     }

  train.mean <- rowMeans(train.X)
  train.sd <- apply(train.X,1,sd)
  train.X <- scale(t(train.X))

  ## standardize the test set using the training statistics
  test.X <- t((test.X-train.mean)/train.sd)

  train.fit <- glpls1a(train.X, train.y, K.prov=K.prov, eps=eps,
                      lmax=lmax, family=family, link=link, br=br)
  test.y.predict <- glpls1a.predict(test.X, train.fit$coefficients,
                                       family=family, link=link)
  error.test <- 1-sum((test.y.predict>0.5)==test.y)/length(test.y)
  ## which() replaces the original seq(1:n)[mask] idiom
  error.obs <- which((test.y.predict>0.5)!=test.y)
  return(list(error=error.test, error.obs = error.obs,
              predict.test=test.y.predict))
}

glpls1a.cv.error <- function(train.X, train.y, K.prov=NULL, eps=1e-3,
lmax=100, family="binomial", link="logit", br=TRUE)
  {
    ## Leave-one-out cross-validated error rate of a gpls fit.
    ## Rows of train.X are left out one at a time, so observations are
    ## rows here (NOTE(review): the original "p by n" comment appears
    ## inconsistent with the code — confirm against callers).
    ## Returns the CV error rate and the misclassified row indices.

    n <- nrow(train.X)
    ## preallocate (the original grew a NULL vector inside the loop)
    train.y.predict <- numeric(n)
    for (i in seq_len(n))
      {
        train.fit <- glpls1a(train.X[-i,], train.y[-i], K.prov=K.prov,
        eps=eps,lmax=lmax,family=family,link=link,br=br)
        train.y.predict[i] <-
        glpls1a.predict(matrix(train.X[i,], nrow=1), train.fit$coefficients)
      }

    error.CV <- 1-sum((train.y.predict>0.5)==train.y)/length(train.y)
    error.obs <- which((train.y.predict>0.5)!=train.y)
    return(list(error=error.CV,error.obs=error.obs))
  }

glpls1a.error <- function(train.X, train.y, K.prov=NULL, eps=1e-3,
lmax=100, family="binomial", link="logit", br=TRUE)
  {
    ## Apparent (resubstitution) error rate: fit on the full training
    ## set and count misclassifications on the same data.
    ## Returns the error rate and the misclassified row indices.

    train.fit <- glpls1a(train.X, train.y, K.prov=K.prov, eps=eps,
    lmax=lmax, family=family, link=link, br=br)
    train.y.predict <- glpls1a.predict(train.X, train.fit$coefficients)
    error <- 1-sum((train.y.predict>0.5)==train.y)/length(train.y)
    error.obs <- which((train.y.predict>0.5)!=train.y)
    return(list(error=error,error.obs=error.obs))
  }


glpls1a.mlogit.cv.error <- function(train.X, train.y, K.prov=NULL,
eps=1e-3, lmax=100, mlogit=TRUE, br=TRUE)
  {
    ## Leave-one-out CV error rate for the multinomial logit model.
    ## train.y holds categories coded 1..C with level 1 the baseline.
    ## mlogit=TRUE fits the joint multinomial model (glpls1a.mlogit);
    ## otherwise C-1 separate logits are fit (glpls1a.logit.all).
    ## Returns the CV error rate and the misclassified row indices.

    family <- "binomial"
    link <- "logit"

    n <- nrow(train.X)

    C <- max(train.y)
    ## predicted probabilities of categories 2..C for each left-out row
    train.y.predict <- matrix(0,nrow=n,ncol=C-1)

    if(is.null(K.prov))
      {
        dx <- dim(train.X)
        K.prov <- min(dx[1],dx[2]-1)
      }

    for (i in seq_len(n))
      {
       if(mlogit)
         {
           train.fit <- glpls1a.mlogit(train.X[-i,], train.y[-i],
           K.prov=K.prov, eps=eps, lmax=lmax, br=br)
         }
       else
         {
            train.fit <- glpls1a.logit.all(train.X[-i,], train.y[-i],
            K.prov=K.prov, eps=eps, lmax=lmax, br=br)
          }

       train.y.predict[i,] <-
       glpls1a.mlogit.predict(matrix(train.X[i,],nrow=1), train.fit$coef)
      }

    ## classify each row to the most probable category, where 1-sum(p)
    ## is the baseline (category 1) probability: temp flags which of
    ## categories 2..C attains the maximum ...
    temp <- t(apply(train.y.predict,1,function(x) x==max(x,1-sum(x))))
    ## ... and rows where none does are assigned to category 1
    temp <- apply(cbind(apply(temp,1,function(x) !any(x)),temp), 1,
                                        function(x) (1:C)[x])
    error.CV <- 1-sum(temp==train.y)/length(train.y)
    error.obs <- which(temp != train.y)
    return(list(error=error.CV,error.obs=error.obs))
  }
glpls1a.logit.all <- function(X,y,K.prov=NULL,eps=1e-3,lmax=100,b.ini=NULL,denom.eps=1e-20,family="binomial",link="logit",br=TRUE)
{
  ## Fit C separate baseline-vs-category logistic gpls models for a
  ## multinomial outcome coded 1..C+1 (level 1 is the baseline): model i
  ## uses only the observations in category 1 or category i+1.
  ## Returns list(coefficients = r x C matrix), intercept in row 1.
  ## family/link arguments are accepted for interface compatibility but
  ## forced to binomial/logit.
  family <-"binomial"
  link <- "logit"

  x <- as.matrix(X)
  yv <- as.numeric(as.vector(y))
  dimnames(x) <- names(yv) <- NULL
  C <- max(yv)-1          # number of non-baseline categories

  n <- nrow(x)
  r <- ncol(x)+1          # coefficients per model, incl. intercept

  ## indicator matrix: y[i, j] == 1 iff observation i is in category j
  y <- matrix(0,n,C+1)
  y[cbind(seq(n),yv)] <- 1
  y0 <- y[,1]             # baseline indicator
  y <- y[,-1,drop=FALSE]

  beta <- matrix(0,r,C)

  for (i in seq_len(C))
    {
      ## keep observations in the baseline or in category i+1
      index <- which(y0 != y[,i])
      ## drop=FALSE keeps X a matrix even when a single row qualifies
      ## (the original collapsed it to a vector, which as.matrix() in
      ## glpls1a would then mis-orient as a column)
      X <- x[index,,drop=FALSE]
      Y <- as.vector(y[index,i])
      beta[,i] <- glpls1a(X,Y,K.prov=K.prov,eps=eps,lmax=lmax,b.ini=b.ini,denom.eps=denom.eps,family=family,link=link,br=br)$coef
    }
  return(list(coefficients=beta))
}
glpls1a.mlogit <- function(x,y,K.prov=NULL,eps=1e-3,lmax=100,b.ini=NULL,denom.eps=1e-20,family="binomial",link="logit",br=TRUE)
{
  ## Multinomial baseline-category logit model fitted by iteratively
  ## reweighted PLS on the expanded (C*n) x (C*r) block design, with
  ## optional Firth-type bias reduction (Ding & Gentleman, 2004).
  ##   x      : n x p predictor matrix (intercept column added if absent)
  ##   y      : outcome coded 1..C+1, category 1 the baseline
  ##   other arguments as in glpls1a; only binomial/logit is supported
  ## Returns a list: coefficients (r x C matrix), convergence flag,
  ## iteration count, and the bias-reduction flag.

  ## BUG FIX: the original used `break` at function top level here,
  ## which is an error outside a loop; stop() raises the intended error.
  if (family != "binomial" || link != "logit")
    stop("wrong family (link) !")

  ## prepend an intercept column unless one is already present
  if(any(x[,1] !=1))
    x <- cbind(rep(1,nrow(x)),x)

  x <- as.matrix(x)
  yv <- as.numeric(as.vector(y))
  dimnames(x) <- names(yv) <- NULL
  r <- ncol(x)
  C <- max(yv)-1        # number of non-baseline categories

  n <- nrow(x)
  ## indicator coding of the outcome; drop the baseline column
  y <- matrix(0,n,C+1)
  y[cbind(seq(n),yv)] <- 1
  y0 <- y[,1]
  y <- y[,-1]
  jn <- rep(1,n)
  jC <- rep(1,C)
  y2 <- as.vector(t(y))   # stacked response: C entries per observation
  ## x2: each row of x repeated C times (one copy per category contrast)
  x2 <- t(kronecker(rep(1,C),x))
  dim(x2) <- c(r,n,C)
  x2 <- aperm(x2,c(1,3,2))
  dim(x2) <- c(r,n*C)
  x2 <- t(x2)

  ## block expanded design: row (i-1)*C+j carries x[i,] in the
  ## coefficient slots of category j
  X <- matrix(0,nrow=C*n,ncol=C*r)
  for ( i in 1:n)
    {
      for (j in 1:C)
        {
          X[((i-1)*C+j),(((j-1)*r+1):((j-1)*r+r))] <- x[i,]
        }
    }

  dx <- dim(X)

  ## number of PLS components, capped at the maximal possible rank.
  ## BUG FIX: the original printed a message and hit an invalid
  ## top-level `break`; stop() with the message instead.
  if (is.null(K.prov)) K <- min(dx[1]-1,dx[2])
  else if ( K.prov > min(dx[1]-1,dx[2]) )
    {
      stop("number of dimension K.prov exceeds rank of X. Provide a number that is less than or equal to ", min(dx[1]-1,dx[2]), "!")
    }
  else
    {
      K <- K.prov
    }

### initializing matrices for PLS regression

  W<- matrix(0,dx[2],K)     # weight matrix for PLS regression
  Ta <- matrix(0,dx[1],K)   # score matrix
  P <- matrix(0,dx[2],K)    # loading matrix for X
  Q <- numeric(K)           # loadings for y

### initial values

  converged <- FALSE

  ## working predictor matrix
  E <- X

  ## pseudo response
  ystar <- psi(y2,family)
  ystar0 <- ystar

  ## GLM weight matrix: random positive diagonal starting values
  V <- diag(abs(rnorm(C*n)),C*n)

  ## linear predictor, i.e. eta = X beta
  eta.old <- rep(1,length(y2))
  eta <- eta.old+100

  Q <- rep(0,K)
  Q.old <- rep(100,K)
  K.old <- K
  min <- 1000     # smallest relative change seen so far

  ## regression coefficients
  if(!is.null(b.ini))
    {
      beta <- b.ini
    }
  else{
    beta <- rep(1,C*r)
  }

  beta.old <- beta/1000

  ## iteration index
  l <- 0
  l.min <- 1

  while ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps) & (l < lmax))
    {
      ## track the iterate with the smallest relative change so it can
      ## be reported if the loop stops without converging
      if ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) < min))
      {
          if(l==1)
            {
              W.min <- W
              P.min <- P
              Q.min <- Q
            }
          else
            {
              ## record minimum values
              min <- max(abs(beta-beta.old)/(abs(beta.old)+denom.eps))
              W.min <- W
              P.min <- P
              Q.min <- Q
              eta.min <- eta
              l.min <- l
            }
      }

      K.old <- K

      l <- l + 1

      W <- matrix(0,dx[2],K)
      Ta <- matrix(0,dx[1],K)
      P <- matrix(0,dx[2],K)
      Q <- numeric(K)

      ### PLS regression on (E, ystar) under the weight matrix V
      for ( i in 1:K) {
        w <- t(E)%*%V%*%ystar
        w <- w/sqrt(crossprod(w)[1])
        W[,i] <- w
        ta <- E%*%w
        Ta[,i] <- ta
        taa <- (t(ta)%*%V%*%ta)[1]
        p <- t(E)%*%V%*%ta/taa
        P[,i] <- p
        q <- (t(ystar)%*%V%*%ta/taa)[1]
        Q[i] <- q
        ystar <- ystar - q*ta    # deflate the response
        E <- E - ta%*%t(p)       # deflate the predictors
      }

      ## update beta; ginv() returns NA when t(P)%*%W is not invertible
      temp <- ginv(t(P)%*%W)
      if(any(is.na(temp)))
        {
          break    # t(P)%*%W singular
        }
      else
        {
          beta.old <- beta
          beta <- W%*%temp%*%Q
        }

      ## hat matrix of the weighted expanded design
      H <- V%*%X%*%ginv(t(X)%*%V%*%X)%*%t(X)

      ## update eta (linear predictor) and the fitted probabilities
      eta <- as.vector(t(x%*%matrix(beta,ncol=C,byrow=FALSE)))
      p <- exp(x%*%matrix(beta,ncol=C ,byrow=FALSE))

      den.p <- 1+as.vector(p%*%jC)

      if(any(is.infinite(den.p)))
        {
          break    # divergence: probabilities overflowed
        }
      p <- p2 <- p/den.p
      dim(p2) <- c(n,C)
      p2 <- as.vector(t(p2))

      ## weight matrix: block-diagonal multinomial covariance,
      ## diag(p_i) - p_i p_i' for each observation i
      V <- matrix(0, C*n, C*n)
      for(j in seq(n)) {
        V[((j - 1) * C + 1):(j * C), ((j - 1) * C + 1):(j * C)] <- diag(p[j,],length(p[j,]))-matrix(p[j,],nrow=C,ncol=1)%*%matrix(p[j,],nrow=1,ncol=C)
      }

      ## d_eta/d_mu matrix, built block-wise per observation
      detadmu <- V
      for ( j in seq(n))
        {
          for (i in ((j - 1) * C + 1):(j * C))
            for (k in (i:(j*C)))
              {
                sum <- sum(p2[((j-1)*C+1):(j*C)])
                if(i==k)
                  {
                    detadmu[i,k] <- (1-sum+p2[k])/(p2[k]*(1-sum))
                  }
                else
                  {
                    detadmu[i,k] <- 1/(1-sum)
                    detadmu[k,i] <- detadmu[i,k]
                  }
              }
        }

      ## per-entry leverage: observation total plus own diagonal of H
      Hw <- NULL
      for ( j in seq(n))
        {
          sum <- sum(diag(H)[((j - 1) * C + 1):(j * C)])
          for (i in ((j - 1) * C + 1):(j * C))
            {
              Hw[i] <- sum + diag(H)[i]
            }
        }

      ## update pseudo response (with bias-reduction adjustment when br)
      if(any(is.na(detadmu)))
        {
          break    # V singular
        }

      ystar <- eta + detadmu%*%(y2+diag(H)*br/2-(Hw*br/2+1)*p2)/(Hw*br/2+1)
      ystar0 <- ystar
      V <- V*(Hw*br/2+1)

      ## reset predictor matrix for the next PLS pass
      E <- X
    }

  if (max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps)
    {
      ## no convergence: fall back to the best recorded iterate
      W <- W.min
      P <- P.min
      Q <- Q.min
    }
  else
    {
      converged <- TRUE
    }
  ## final estimates
  beta <- W%*%ginv(t(P)%*%W)%*%Q

return( list(coefficients=matrix(beta,ncol=C ,byrow=FALSE),
              convergence = converged,
              niter = l,bias.reduction =br))
}
#replaced by the predict method for gpls
glpls1a.predict <- function(X, beta, family = "binomial", link = "logit")
{
  ## Mean response for new data: prepend an intercept column unless the
  ## first column of X is already all ones, then apply the inverse link
  ## h() to the linear predictor.
  has.intercept <- all(X[, 1] == rep(1, nrow(X)))
  eta <- if (has.intercept) {
    X %*% beta
  } else {
    cbind(rep(1, nrow(X)), X) %*% beta
  }
  h(eta, family, link)
}

glpls1a.mlogit.predict <- function(X, beta)
  {
    ## Predicted category probabilities for the multinomial (baseline)
    ## logit model.  beta is p x (J-1), where J is the number of outcome
    ## categories; column j holds the contrast of category j+1 against
    ## the baseline.  An intercept column is prepended unless the first
    ## column of X is already all ones.  Returns an n x (J-1) matrix of
    ## probabilities for categories 2..J.
    eta <- if (all(X[, 1] == rep(1, nrow(X)))) {
      X %*% beta
    } else {
      cbind(rep(1, nrow(X)), X) %*% beta
    }
    expeta <- exp(eta)
    ## row-wise normalizer 1 + sum_j exp(eta_ij); recycling divides each
    ## row of expeta by its own denominator, as in the original
    expeta / (1 + rowSums(expeta))
  }
ginv <- function (X, tol = sqrt(.Machine$double.eps))
{
    ## Moore-Penrose generalized inverse via SVD: an NA-tolerant variant
    ## of MASS::ginv.  Returns the scalar NA when X contains NAs or when
    ## the singular-value screen itself yields NAs, so callers can test
    ## failure with is.na(); otherwise returns the pseudo-inverse matrix
    ## (dim(X)[2] x dim(X)[1]).
    if (length(dim(X)) > 2 || !(is.numeric(X) || is.complex(X)))
        stop("X must be a numeric or complex matrix")
    if (!is.matrix(X))
        X <- as.matrix(X)
    if (any(is.na(X)))
        return(NA)
    s <- svd(X)
    if (is.complex(X))
        s$u <- Conj(s$u)
    ## keep singular values above the relative tolerance
    keep <- s$d > max(tol * s$d[1], 0)
    if (any(is.na(keep)))
        return(NA)
    if (all(keep))
        return(s$v %*% (1/s$d * t(s$u)))
    if (!any(keep))
        return(array(0, dim(X)[2:1]))
    s$v[, keep, drop = FALSE] %*% ((1/s$d[keep]) *
        t(s$u[, keep, drop = FALSE]))
}

