Newton Boosting

Going through Friedman’s paper on Gradient Boosting (http://statweb.stanford.edu/~jhf/ftp/trebst.pdf) is a revelation that few technical work can match. It changes the way you look at machine learning problem formulations and endless possibilities emerge.

Recall the basic steps of gradient boosting. One has to find the right direction to find a minimum of the loss function $L(y, F(\mathbf{x}))$. The direction is given by the gradient of the loss function, $\nabla L_F$. This gradient is a new ‘pseudo-response’ that is used as a response variable to fit a new regression model. The optimal step in this direction is found by solving a line-search problem, i.e., $\rho^{*} = arg min_\rho L(y, F_{m-1}(\mathbf{x}) + \rho F(\mathbf{x}))$.

What are the other options? Two alternatives immediately come to mind. Both are based on iterative steps followed in Newton’s methods. For Newton-Raphson method, this is

$x_{n+1} = x_n - \underbrace{f(x_n)/f'(x_n)}$

and for second-order approximation

$x_{n+1} = x_n - \underbrace{f'(x_n)/f''(x_n)}$

Thus, instead of the original one in Friedman’s paper we can use these directions, i.e., regression models at each step would fit (1) $-L(y, \mathbf{x})/L'(y, \mathbf{x})$ and (2) $-L'(y, \mathbf{x})/L''(y, \mathbf{x})$. Note, these are just the directions and then to find the optimal steps in these directions we still need to carry out the line-search step to find the optimal step-size, $\rho^*$. Below is the source code for boosting with all the three options.


nbm_v0 <- function(train.data, test.data, M, lambda, TreeParams,
modelType = "gbm", verbose=FALSE){

names.x <- setdiff(names(train.data), "y")
x <- subset(train.data, select=names.x)

# Should use "anova" and not "class"
h0 <- rpart(y ~ ., data=train.data, method="anova", control=TreeParams)
yp <- predict(h0, train.data, type="vector") # raw score
pred <- sapply(yp, function(x) ifelse(x < 0, -1, 1))
t0 <- table(train.data$y, pred) print(t0) print(binaryEvalMetric(t0)) p.test <- predict(h0, test.data, type = "vector") p.test <- sapply(p.test, function(x) ifelse(x < 0, -1, 1)) t.test <- table(test.data$y, p.test)
print(binaryEvalMetric(t.test))

F0 <- sum(apply(as.matrix(cbind(train.data$y, yp)), 1, Loss )) print(sprintf("Initial Cost & accuracy: %g, %g",F0, sum(diag(t0))/sum(t0))) models <- vector(mode = "list", length=(M+1)) rvec <- vector(mode = "numeric", length=M) loss <- vector(mode = "numeric") models[[1]] <- h0 loss[1] <- F0 for (m in 1:M){ # Compute the pseudo-response (negative gradient for gradient boosting) if (modelType == "gbm"){ yt <- apply(as.matrix(cbind(train.data$y, yp)), 1, evalGrad )
} else if (modelType == "nbm_1") {
yt <- apply(as.matrix(cbind(train.data$y, yp)), 1, NewtonFun ) } else if (modelType == "nbm_2"){ yt <- apply(as.matrix(cbind(train.data$y, yp)), 1, NewtonFun2 )
}

# Fit a new model for the pseudo-response
model <- rpart(yt ~ ., data=data.frame(cbind(x,yt)), method="anova",
control=TreeParams)
models[[m+1]] <- model

# Get the new response
yc <- predict(model, x, type="vector")

# Line Search
ucobject <- ucminf(par=1e-03, fn=Lsearch, gr=NULL, train.data$y, yp, yc, control = list(grtol = 1e-12)) rho <- ucobject$par
# print(ucobject$message) rvec[m] <- rho # New prediction yp <- yp + lambda*rho*yc Fm <- sum(apply(as.matrix(cbind(train.data$y, yp)), 1, Loss ))
print(sprintf("Tree # %d: Current Cost = %g | rho = %g",m+1, Fm, rho))
loss <- c(loss, Fm)

if (abs(Fm-F0) < 1e-03){
print("No improvement from line search: exiting")
break
} else {
F0 <- Fm
}

}
print(sprintf("Built %d trees", (m+1)))

xtry <- seq(from=-0.9, to=0.5, length.out=150)
f1val <- vector("numeric")
for(xx in xtry){
f1val <- c(f1val, F1score(xx, train.data$y, yp)) } # plot(xtry, f1val) threshold <- xtry[which.max(f1val)] print(sprintf("Threshold for maximum F1-score is %g", threshold)) pred <- sapply(yp, function(x) ifelse(x < threshold, -1, 1)) t0 <- table(train.data$y, pred)
print(t0)
print(sprintf("Final accuracy (training data): %g", sum(diag(t0))/sum(t0)))

p0 <- predict(models[[1]], test.data, type = "vector")
pred0 <- sapply(p0, function(x) ifelse(x < threshold, -1, 1))
t00 <- table(test.data$y, pred0) print(t00) print(binaryEvalMetric(t00)) for (ii in 1:m){ print(sprintf("Evaluating %d out of %d trees", ii, m)) p0 <- p0 + lambda*rvec[ii]*predict(models[[(ii+1)]], test.data, type = "vector") } pred <- sapply(p0, function(x) ifelse(x < threshold, -1, 1)) t0 <- table(test.data$y, pred)
print(t0)
print(binaryEvalMetric(t0))
print(sprintf("Test accuracy: %g", sum(diag(t0))/sum(t0)))
plot(loss, type="b")
}



The functions used in this code, namely, evalGrad, NewtonFun and NewtonFun2 are given below:


# negative of the derivative of the loss function with respect to the prediction
# Note: x[1] = actual label, y; x[2] = prediction, F

# numerator = the prediction itself

# denominator = negative of the derivative of the loss function

# with respect to the prediction
# Note: x[1] = actual label, y; x[2] = prediction, F

# Note: x[1] = actual label, y; x[2] = prediction, F



whereas the line-search and associated functions are:


# function to find the right increment in the current prediction (yc) direction
Lsearch <- function(rho, y, Fm, yc){
yp <- Fm + rho*yc
val <- sum(apply(as.matrix(cbind(y, yp)), 1, Loss ))
return(val)
}

# negative Binomial log-likelihood: log(1+exp(-2*y*F))
Loss <- function(x){log(1+exp(-2*x[1]*x[2]))}