""" Rpy glm example: Shows how to use glm (with family=binomial) to predict the probability of a binary (yes or no) outcome. I use it to produce probability forecasts from a weather prediction model - this a simple toy example that illustrates the process. Jeff Whitaker 20030801 """ import sys from Numeric import * from RandomArray import * from MLab import * import sys sys.path.insert(1, "..") from rpy import * # specify correlation between forecast and observations. fcstcorr = 0.5 # specify link function for glm (logit, probit or cloglog) link = 'logit' # specify exceedance threshold for probability forecast thresh = 0. # specify number of realizations. nfcsts = 10000 # compute climatological probabaility of exceeding threshold. climprob = r.pnorm(thresh,0.,1.)*ones(nfcsts,'f') print 'fcstcorr = ',fcstcorr print 'link = ',link print 'thresh = ',thresh print 'climo prob = ',max(climprob) print 'nfcsts = ',nfcsts # generate realizations from multivariate normal distribution # # obsvar fcstcov # with cov matrix = # fcstcov fcstvar # # obsvar is fixed to 1, fcstcov = fcstcorr*sqrt(fcstvar). # so only free parameter is fcstcorr. fcstvar = fcstcorr**2 fcstcov = fcstcorr*sqrt(fcstvar) # repeat the process 1000 times. for nrun in range(1000): print 'run ',nrun x = multivariate_normal([0.,0.], [[fcstcov,fcstvar],[1.,fcstcov]], nfcsts) # Now have two time series that are correlated at r=fcstcorr. # Call one the 'observations' and one the 'forecast' obs = x[:,0] fcst = x[:,1] # train the glm (fit the model) binaryobs = obs > thresh d = r.data_frame(x=fcst.tolist(),y=binaryobs.tolist()) formula = r("y ~ x") family = r("family=binomial(link="+link+")") model = with_mode(NO_CONVERSION,r.glm)(formula,family=family,data=d) print 'model coefficients = ',r.coef(model) # independent data for verification x = multivariate_normal([0.,0.], [[fcstcov,fcstvar],[1.,fcstcov]], nfcsts) obs = x[:,0] fcst = x[:,1] # predict the probability that fcst > thresh, using # the glm on independent data. d = r.data_frame(x=fcst.tolist()) fcstprob = array(r.predict(model, newdata=d, type="response")) # calculate brier skill score of the forecasts. # bss = 0: forecasts are no better than climatology # bss = 1: forecasts are perfect. verif = obs > thresh bs = mean((fcstprob - verif)**2) bsclim = mean((climprob - verif)**2) bss = 1.-(bs/bsclim) print "brier skill score = ",bss # try to delete the objects to free memory (doesn't work) del model del d