## Copyright (C) 2005, 2006 William Poetra Yoga Hadisoeseno ## ## This file is part of Octave. ## ## Octave is free software; you can redistribute it and/or modify it ## under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2, or (at your option) ## any later version. ## ## Octave is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Octave; see the file COPYING. If not, write to the Free ## Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA ## 02110-1301, USA. ## -*- texinfo -*- ## @deftypefn {Function File} {[@var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats}] =} regress (@var{y}, @var{X}, [@var{alpha}]) ## Multiple Linear Regression using Least Squares Fit of @var{y} on @var{X} ## with the model @code{y = X * beta + e}. ## ## Here, ## ## @itemize ## @item ## @code{y} is a column vector of observed values ## @item ## @code{X} is a matrix of regressors, with the first column filled with ## the constant value 1 ## @item ## @code{beta} is a column vector of regression parameters ## @item ## @code{e} is a column vector of random errors ## @end itemize ## ## Arguments are ## ## @itemize ## @item ## @var{y} is the @code{y} in the model ## @item ## @var{X} is the @code{X} in the model ## @item ## @var{alpha} is the significance level used to calculate the confidence ## intervals @var{bint} and @var{rint} (see `Return values' below). If not ## specified, ALPHA defaults to 0.05 ## @end itemize ## ## Return values are ## ## @itemize ## @item ## @var{b} is the @code{beta} in the model ## @item ## @var{bint} is the confidence interval for @var{b} ## @item ## @var{r} is a column vector of residuals ## @item ## @var{rint} is the confidence interval for @var{r} ## @item ## @var{stats} is a row vector containing: ## ## @itemize ## @item The R^2 statistic ## @item The F statistic ## @item The p value for the full model ## @item The estimated error variance ## @end itemize ## @end itemize ## ## @var{r} and @var{rint} can be passed to @code{rcoplot} to visualize ## the residual intervals and identify outliers. ## ## NaN values in @var{y} and @var{X} are removed before calculation begins. ## ## @end deftypefn ## References: ## - Matlab 7.0 documentation (pdf) ## - 《大学数学实验》 姜启源 等 (textbook) ## - http://www.netnam.vn/unescocourse/statistics/12_5.htm ## - wsolve.m in octave-forge ## - http://www.stanford.edu/class/ee263/ls_ln_matlab.pdf function [b, bint, r, rint, stats] = regress (y, X, alpha) if (nargin < 2 || nargin > 3) usage ("[b, bint, r, rint] = regress (y, X, [alpha])"); endif if (! ismatrix (y)) error ("regress: y must be a numeric matrix"); endif if (! ismatrix (X)) error ("regress: X must be a numeric matrix"); endif if (columns (y) != 1) error ("regress: y must be a column vector"); endif if (rows (y) != rows (X)) error ("regress: y and X must contain the same number of rows"); endif if (nargin < 3) alpha = 0.05; elseif (! isscalar (alpha)) error ("regress: alpha must be a scalar value") endif notnans = ! logical (sum (isnan ([y X]), 2)); y = y(notnans); X = X(notnans,:); [Xq Xr] = qr (X, 0); pinv_X = Xr \ Xq'; b = pinv_X * y; if (nargout > 1) n = rows (X); p = columns (X); dof = n - p; t_alpha_2 = t_inv (alpha / 2, dof); H = X * pinv_X; r = (eye (n) - H) * y; SSE = sum (r .^ 2); v = SSE / dof; # c = diag(inv (X' * X)) using (economy) QR decomposition # which means that we only have to use Xr c = diag (inv (Xr' * Xr)); db = t_alpha_2 * sqrt (v * c); bint = [b + db, b - db]; endif if (nargout > 3) dof1 = n - p - 1; h = diag (H); # From Matlab's documentation on Multiple Linear Regression, # sigmaihat2 = norm (r) ^ 2 / dof1 - r .^ 2 / (dof1 * (1 - h)); # dr = -tinv (1 - alpha / 2, dof) * sqrt (sigmaihat2 .* (1 - h)); # Substitute # norm (r) ^ 2 == sum (r .^ 2) == SSE # -tinv (1 - alpha / 2, dof) == tinv (alpha / 2, dof) == t_alpha_2 # We get # sigmaihat2 = (SSE - r .^ 2 / (1 - h)) / dof1; # dr = t_alpha_2 * sqrt (sigmaihat2 .* (1 - h)); # Combine, we get # dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1); dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1); rint = [r + dr, r - dr]; endif if (nargout > 4) R2 = 1 - SSE / sum ((y - mean (y)) .^ 2); # F = (R2 / (p - 1)) / ((1 - R2) / dof); F = dof / (p - 1) / (1 / R2 - 1); pval = 1 - f_cdf (F, p - 1, dof); stats = [R2 F pval v]; endif endfunction %!test %! % Longley data from the NIST Statistical Reference Dataset %! Z = [ 60323 83.0 234289 2356 1590 107608 1947 %! 61122 88.5 259426 2325 1456 108632 1948 %! 60171 88.2 258054 3682 1616 109773 1949 %! 61187 89.5 284599 3351 1650 110929 1950 %! 63221 96.2 328975 2099 3099 112075 1951 %! 63639 98.1 346999 1932 3594 113270 1952 %! 64989 99.0 365385 1870 3547 115094 1953 %! 63761 100.0 363112 3578 3350 116219 1954 %! 66019 101.2 397469 2904 3048 117388 1955 %! 67857 104.6 419180 2822 2857 118734 1956 %! 68169 108.4 442769 2936 2798 120445 1957 %! 66513 110.8 444546 4681 2637 121950 1958 %! 68655 112.6 482704 3813 2552 123366 1959 %! 69564 114.2 502601 3931 2514 125368 1960 %! 69331 115.7 518173 4806 2572 127852 1961 %! 70551 116.9 554894 4007 2827 130081 1962 ]; %! % Results certified by NIST using 500 digit arithmetic %! % b and standard error in b %! V = [ -3482258.63459582 890420.383607373 %! 15.0618722713733 84.9149257747669 %! -0.358191792925910E-01 0.334910077722432E-01 %! -2.02022980381683 0.488399681651699 %! -1.03322686717359 0.214274163161675 %! -0.511041056535807E-01 0.226073200069370 %! 1829.15146461355 455.478499142212 ]; %! Rsq = 0.995479004577296; %! F = 330.285339234588; %! y = Z(:,1); X = [ones(rows(Z),1), Z(:,2:end)]; %! alpha = 0.05; %! [b, bint, r, rint, stats] = regress (y, X, alpha); %! assert(b,V(:,1),3e-6); %! assert(stats(1),Rsq,3e-13); %! assert(stats(2),F,3e-8); %! assert(((bint(:,1)-bint(:,2))/2)/t_inv(alpha/2,9),V(:,2),7.6e-3);