Reason for disagreement between glmnet results from Matlab and R

1 view (last 30 days)

My question is about comparing glmnet results from Matlab and R.

The following code estimates $\beta$ using the glmnet function in Matlab.

clear;
load acetylene;
X = [x1 x2 x3];
cy = y - mean(y);
% write files as csv so that we can import them in R
csvwrite('testX.csv', X);
csvwrite('testY.csv', y);
% the lambda seq
lseq = exp(linspace(2, 8, 16));
csvwrite('testLam.csv', lseq)
% fit using glmnet
[B fitInfo] = lasso(X, cy, 'Lambda', fliplr(lseq), 'Alpha', 0.55, 'Standardize', false); % notice 'Standardize', false %
% compare the zeros with R's zeros later
csvwrite('matlab_b.csv', B)

The R version to accomplish this is as follows.

rm(list = ls())
xmat <- as.matrix(read.csv("testX.csv", header = FALSE))
colnames(xmat) <- c("x1", "x2", "x3")
ymat <- as.matrix(read.csv("testY.csv", header = FALSE))
yvec <- as.numeric(ymat)
cyvec <- yvec - mean(yvec)
lseq <- as.numeric(unlist(read.csv("testLam.csv", header = FALSE)))
library(glmnet)
fits <- glmnet(xmat, cyvec, family = "gaussian", alpha = 0.55, lambda = rev(lseq), standardize = FALSE)
bb <- as.matrix(fits$beta)[, seq(16, 1, by = -1)]
mbb <- as.matrix(read.csv("matlab_b.csv", header = FALSE))

We see that while the magnitudes differ slightly across the λ sequence, the number of coefficients set to 0 agree.

# check if #zeros are equal
all.equal(which(mbb == 0), which(bb == 0))
# [1] TRUE
all.equal(as.vector(mbb), as.vector(bb))
# [1] "Mean relative difference: 0.03296765"

The results, however, disagree significantly if I try to do what I think matlab is trying to do (assuming I understand the documentation correctly).

clear;
load acetylene;
X = [x1 x2 x3];
% standardize and center
sxmat = (X - repmat(mean(X, 1), size(X, 1), 1)) ./ repmat(std(X, 0, 1), size(X, 1), 1);
% center ys
cy = y - mean(y);
% write files
csvwrite('testSX.csv', sxmat);
csvwrite('testCY.csv', cy);
% the lambda seq
lseq = exp(linspace(2, 8, 16));
csvwrite('testLam.csv', lseq)
% fit using glmnet and without standardization.
[B fitInfo] = lasso(sxmat, cy, 'Lambda', fliplr(lseq), 'Alpha', 0.55, 'Standardize', false);
% save beta for comparison 
csvwrite('matlab_sb.csv', B)

Now repeat this process in R.

rm(list =ls())
sxmat <- as.matrix(read.csv("testSX.csv", header = FALSE))
colnames(sxmat) <- c("x1", "x2", "x3")
ymat <- as.matrix(read.csv("testCY.csv", header = FALSE))
cyvec <- as.numeric(ymat)
lseq <- as.numeric(unlist(read.csv("testLam.csv", header = FALSE)))
library(glmnet)
fits <- glmnet(sxmat, cyvec, family = "gaussian", alpha = 0.55, lambda = rev(lseq), standardize = FALSE)
sbb <- as.matrix(fits$beta)[, seq(16, 1, by = -1)]
smbb <- as.matrix(read.csv("matlab_sb.csv", header = FALSE))
The results are now wildly different.
all.equal(as.vector(smbb), as.vector(sbb))
# [1] "Mean relative difference: 1.703174"
The entries that are 0s still agree, however.
all.equal(which(as.vector(smbb) == 0), which(as.vector(sbb) == 0))
[1] TRUE
I am sure I am missing something but unable to spot the reason. (Possibly, default standardization of y by default in R and not so in Matlab?)

Answers (0)

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!