bic.logit <- function(x, y, n = rep(1, length(y)), strict = F, OR = 20, OR.fix = 2, nbest = 150, scale = 1, prior.param = c(rep(0.5, ncol(x)))) { ### S-PLUS function to apply Bayesian Model Averaging to variable selection for ### logistic regression. Bayesian Model Averaging accounts for the model ### uncertainty inherent in the variable selection problem by averaging over the ### best models in themodel class according to posterior model probability. ### For more details see references at the end of the documentation. ### ### Version 2.0: November 20, 1996 by Chris T. Volinsky and Adrian E. Raftery ### (Version 1.0: October 15, 1994 Adrian E. Raftery) ### ### Developers: Adrian Raftery (raftery@stat.washington.edu) and Chris T. Volinsky ### (volinsky@stat.washington.edu) . ### This software is not formally maintained, but we will be happy to ### hear from people who have problems with it, although we cannot guarantee to ### to solve them! ### Permission is hereby given to StatLib to redistribute this software. ### The software can be freely used for non-commercial purposes, and can ### be freely distributed for non-commercial purposes only. ### The copyright is retained by the developer. ### Copyright 1994 Adrian E. Raftery; 1996 Adrian E. Raftery and Chris T. Volinsky ### Inputs: ### x matrix of independent variables ### y dependent variable ### n the binomial denominator for logistic regression ### (default all 1's) ### strict = T will return a more parsimonious set of models (Occam's Window). ### Any model with a more likely submodel will be eliminated. ### = F (default). Returns all models within 1/OR in posterior model prob. ### OR maximum ratio for excluding models in the averaging window ### (default 20). The function will average over a ### set of models including the best model and all ### models with a posterior model probability of at ### least 1/OR of the best model. ### OR.fix Width of the window which keeps models after the leaps ### approximation is done. Because the leaps and bounds gives only an ### approximation to BIC, there is a need to increase the window at this ### first "cut" so as to assure that no good models are deleted. The ### scale glim scale parameter. Default is 1. ### if "deviance", the deviance scale estimate is used ### if "test", a chisquare test for overdispersion is done first ### with sig. level corresponding to BIC for that sample size ### nbest set the nbest parameter for the leaps function (default = 150) ### prior.param p-vector of prior probabilities that parameter is non-zero. Default puts ### a prior of .5 on all parameters. Setting to 1 forces variables into the ### model. ### ### Outputs: ### postprob posterior probabilities of the models selected ### label labels identifying the models selected ### dev deviance for each model selected ### df degrees for each model selected ### bic BIC = dev/scale - df * log(n). Smaller BIC means higher posterior model ### probability. ### (Note: BIC returned does not take into account model priors, ### so rank order of BIC may not be the same as the rank ### order of postprob if non-uniform parameter priors are ### specified) ### size the number of independent variables in each of the models ### which a logical matrix of size length(label) x size ### probne0 posterior probability that each variable is non-zero (in %) ### postmean model-averaged posterior mean of the parameters (includes averaged-in ### zeros) ### postsd model-averaged posterior sd of the parameters (includes averaged-in zeros). ### mle n x p matrix giving the maximum likelihood estimate of each coefficient ### for each model. ### prior.param the priors on the parameters. ### prior.model.weights ### If priors on variables are not all .5, then prior weights gives the ratio ### of the model prior to a uniform model prior, based on prior.param. A ### prior.model.weight > 1 means the model is weighed more than uniform ### se n x p matrix giving the standard errors of each coefficient for each model. ### scale the scale estimate used, as defined in "glim" ### References: ### Raftery, Adrian E. (1995). Bayesian model selection in social research ### (with Discussion). Sociological Methodology 1995 (Peter V. Marsden, ed.), ### pp. 111-196, Cambridge, Mass.: Blackwells. ### An earlier version, issued as Working Paper 94-12, Center for Studies in Demography and ### Ecology, University of Washington (1994) is available as a Postscript file at ### http://www.stat.washington.edu/tech.reports/bic.ps ### It is also available via ftp using the following commands: ### ftp ftp.stat.washington.edu (or 128.95.17.34) ### login as anonymous ### enter your email address as your password ### ftp>cd /pub/tech.reports ### ftp>get bic.ps ### ftp>bye ### ### Furnival, G.M. and R.W. Wilson (1974) "Regression by leaps and bounds" ### Technometrics 16, 499-511. ### ### Lawless, J. and K. Singhal (1978) "Efficient screening on nonnormal ### regression models. Biometrics 34 318-27. leaps.bl <- function(info, coef, names.arg, nbest = nbest) { ## A modifcation of the S-plus leaps function to employ the ## Lawless/Singhal adjustment names.arg <- names.arg # evaluate names.arg before cbind if(is.null(names.arg)) names.arg <- c(as.character(1:9), LETTERS, letters)[1: ncol(info)] if(length(names.arg) < ncol(info)) stop("Too few names") bIb <- coef %*% info %*% coef kx <- ncol(info) maxreg <- nbest * kx if(kx < 3) stop("Too few independent variables") if(kx >= 31) stop("Problem too large") imeth <- 1 # Corresponds to R^2 df <- kx + 1 # df not important for r2 Ib <- info %*% coef rr <- cbind(info, Ib) rr <- rbind(rr, c(Ib, bIb)) ans <- .Fortran("leaps", ## Fortran routine as.single(rr), as.integer(kx), as.integer(kx + 1), as.integer(df), as.integer(imeth), as.integer(nbest), as.single(0.0001), regid = integer(maxreg), Cp = single(maxreg), size = integer(maxreg), nreg = integer(1), single((nbest + 4) * (kx + 1) + (((kx + 1) * (kx + 2))/ 2) * ((2 * (kx + 3))/3 + 1)), integer(4 * (kx + 1)^2 + 8 * (kx + 1) + (nbest + 1) * (kx + 1)))[c("Cp", "size", "nreg", "regid")] ### cleanup Fortran output; fix Cp, r2 and adjr2. nreg <- ans$nreg Cp <- ans$Cp length(Cp) <- nreg size <- ans$size length(size) <- nreg which <- matrix(T, nreg, kx) z <- ans$regid length(z) <- nreg ### Rick Becker's mask function to create which and then labels which <- matrix(as.logical((rep.int(z, kx) %/% rep.int(2^((kx - 1):0), rep.int(length(z), kx))) %% 2), byrow = F, ncol= kx) label <- character(nreg) sep <- if(all(nchar(names.arg) == 1)) "" else "," for(i in 1:nreg) label[i] <- paste(names.arg[which[i, ]], collapse = sep) ans <- list(Cp, size = size, label = label, which = which) names(ans)[1] <- "r2" ans } ### end of leaps.bl ### Set original names of x-variables, if there are none to start with. names.x <- paste("X",1:ncol(x),sep="") prior.weight.denom <- .5^ncol(x) names.arg <- dimnames(x)[[2]] if(is.null(dimnames(x)[[2]])) names.arg <- names.x if (is.null(dimnames(x)[[1]])) dimnames(x)[[1]]<-paste("R",1:nrow(x),sep="") x2 <- na.omit(data.frame(x)) used <- match(dimnames(x)[[1]], dimnames(x2)[[1]]) omitted <- seq(nrow(x))[is.na(used)] if(length(omitted) > 0) { warning(paste("there were ",length(omitted),"records deleted due to NA's")) n <- n[ - omitted] x <- data.matrix(x2) y <- y[ - omitted] } cdf <-cbind.data.frame(y=y,x) mm <- model.matrix(formula(cdf),data=cdf)[,-1,drop=F] x <- mm glim.out <- glim(x, y, n, error = "binomial", link = "logit") if(scale == 1) scale.est <- 1 if(scale == "deviance") scale.est <- max(glim.out$deviance[2]/glim.out$df[2], 1) if(scale == "test") { alpha <- 2 * pnorm( - sqrt(log(N))) # Sig. level based on BIC P <- 1 - pchisq(glim.out$deviance[2], glim.out$df[2]) if(P < alpha) scale.est <- max(glim.out$deviance[2]/glim.out$df[2], 1 ) else scale.est <- 1 } ### If there are more than 30 columns in x, reduce to 30 using backwards ### elimination. if(ncol(x) > 30) { while(ncol(x) > 30) { glim.out <- glim(x, y, n, error = "binomial", link = "logit", scale = scale.est) coef <- glim.out$coef se <- sqrt(diag(glim.out$var)) pvec <- signif(1 - pchisq((coef/se)^2, 1), 3) x <- x[, - rev(order(pvec))[1]] } } names.arg <- dimnames(x)[[2]] glim.out <- glim(x, y, n, error = "binomial", link = "logit", scale = scale.est) coef <- glim.out$coef[-1] info <- solve(glim.out$var[-1, -1]) if(ncol(x) > 2) { glim.out <- glim(x, y, n, error = "binomial", link = "logit", scale = scale.est) a <- leaps.bl(info = info, coef = coef, names.arg = names.arg, nbest = nbest) a$r2 <- pmin(pmax(0, a$r2), 99.9) # Include the null model a$r2 <- c(0, a$r2)/100 a$size <- c(0, a$size) a$label <- c("NULL", a$label) a$which <- rbind(rep(F, ncol(x)), a$which) nobs <- length(y) ncases <- sum(n) nmod <- length(a$size) prior.mat <- matrix(rep(prior.param, nmod), nmod, ncol(x), byrow = T) prior <- apply(a$which * prior.mat + (!a$which) * (1 - prior.mat), 1, prod) bIb <- as.numeric(coef %*% info %*% coef) lrt <- bIb - (a$r2 * bIb) bic <- lrt + (a$size) * log(ncases) - 2 * log(prior) ### Eliminate models with little evidence occam <- bic - min(bic) < 2 * OR.fix * log(OR) ## From MUS p.24 with OR=B_12 r2 <- a$r2[occam] size <- a$size[occam] label <- a$label[occam] which <- a$which[occam, ,drop=F] bic <- bic[occam] prior <- prior[occam] ### Calculate BIC exactly for the remaining models using "glim", model.fits <- as.list(rep(0, length(label))) dev <- rep(0, length(label)) df <- rep(0, length(label)) for(k in (1:length(label))) { if(sum(which[k, ]) == 0) glim.out <- glim(rep(1, nobs), y, n, error = "binomial", link = "logit", intercept = F, scale = scale.est) else { x.mat <- matrix(x[, which[k, ]], nrow = nobs, dimnames = list(NULL, names.arg[which[k, ]]) ) glim.out <- glim(x.mat, y, n, error = "binomial", link = "logit", scale = scale.est ) } dev[k] <- glim.out$deviance[2] df[k] <- glim.out$df[2] bic[k] <- dev[k]/scale.est - df[k] * log(ncases) - 2 * log(prior)[k] model.fits[[k]] <- matrix(rep(0, 2 * sum(which[k, ]) + 2), ncol = 2) model.fits[[k]][, 1] <- glim.out$coef # CTV 7/95 model.fits[[k]][, 2] <- sqrt(diag(glim.out$var)) } # for loop } else { # ncol < 2 nmod <- switch(ncol(x), 2, 4) dev <- df <- bic <- label <- rep(0, nmod) model.fits <- as.list(rep(0, nmod)) which <- matrix(c(F, T, F, T, F, F, T, T), nmod, nmod/2) size <- c(0, 1, 1, 2)[1:nmod] names.arg <- dimnames(x)[[2]] if(is.null(names.arg)) names.arg <- c(as.character(1:9), LETTERS, letters)[1: ncol(x)] sep <- if(all(nchar(names.arg) == 1)) "" else "," nobs <- length(y) ncases <- sum(n) prior.mat <- matrix(rep(prior.param, nmod), nmod, ncol(x), byrow = T) prior <- apply(which * prior.mat + (!which) * (1 - prior.mat), 1, prod) for(k in 1:nmod) { if(sum(which[k, ]) == 0) glim.out <- glim(rep(1, nobs), y, n, error = "binomial", link = "logit", intercept = F, scale = scale.est) else { x.mat <- matrix(x[, which[k, ]], nrow = nobs, dimnames = list(NULL, names.arg[which[k, ]]) ) glim.out <- glim(x.mat, y, n, error = "binomial", link = "logit", scale = scale.est ) } dev[k] <- glim.out$deviance[2] df[k] <- glim.out$df[2] bic[k] <- dev[k]/scale.est - df[k] * log(ncases) - 2 * log(prior)[k] label[k] <- paste(names.arg[which[k, ]], collapse = sep) if(k == 1) label[k] <- "NULL" model.fits[[k]] <- matrix(rep(0, 2 * sum(which[k, ]) + 2), ncol = 2) model.fits[[k]][, 1] <- glim.out$coef model.fits[[k]][, 2] <- sqrt(diag(glim.out$var)) } # for loop within else } occam <- bic - min(bic) < 2 * log(OR) dev <- dev[occam] df <- df[occam] size <- size[occam] label <- label[occam] which <- matrix(which[occam, ,drop=F], nrow = sum(occam)) bic <- bic[occam] model.fits <- model.fits[occam] # CTV 7/95 prior <- prior[occam] postprob <- exp(-0.5 * (bic - min(bic)))/sum(exp(-0.5 * (bic - min(bic) ))) ### Order models in descending order of posterior probability order.bic <- order(bic, size, label) dev <- dev[order.bic] df <- df[order.bic] size <- size[order.bic] label <- label[order.bic] which <- matrix(which[order.bic, ,drop=F ], nrow = sum(occam)) bic <- bic[order.bic] prior_prior[order.bic] postprob <- postprob[order.bic] model.fits <- model.fits[order.bic] # CTV 7/95 ### Apply the second rule to eliminate models with better ones nested within them. if(strict) { nmod <- length(bic) occam <- rep(T, nmod) for(k in (2:nmod)) { for(j in (1:(k - 1))) { which.diff <- which[k, ] - which[j, ] if(all(which.diff >= 0)) occam[k] <- F } } dev <- dev[occam] df <- df[occam] size <- size[occam] label <- label[occam] which <- which[occam, ,drop=F ] bic <- bic[occam] prior <- prior[occam] postprob <- postprob[occam] postprob <- postprob/sum(postprob) model.fits <- model.fits[occam] # CTV 7/95 } ## rescale so bic for null model = 0 CTV 6/96 # logprior.null <- sum ( log (1-prior.param) ) bic <- bic + 2 * log(prior) - (glim.out$dev[1]/scale.est - glim.out$df[1]*log(ncases)) ### Calculate Pr[beta_i != 0 | D] for each coefficient beta_i ## revised to allow for NULL only, CTV 7/95 nmod <- length(bic) if(nmod == 1 & size[1] == 0) { probne0 <- 100 Ebi <- EbiMk <- 0 SDbi <- sebiMk <- 0 } else { probne0 <- round(100 * t(which) %*% as.matrix(postprob), 1) nvar <- length(x[1, ]) Ebi <- rep(0, nvar) SDbi <- rep(0, nvar) EbiMk <- matrix(rep(0, nmod * nvar), nrow = nmod) sebiMk <- matrix(rep(0, nmod * nvar), nrow = nmod) for(i in (1:nvar)) { if(sum(which[, i] != 0)) { for(k in (1:nmod)) { if(which[k, i] == T) { ##Find position of beta_i in model k pos <- sum(which[k, (1:i)]) EbiMk[k, i] <- model.fits[[k]][(pos + 1), 1 ] sebiMk[k, i] <- model.fits[[k]][(pos + 1), 2] } } Ebi[i] <- postprob %*% EbiMk[, i] SDbi[i] <- sqrt(postprob%*%(sebiMk[,i]^2) + postprob%*%((EbiMk[, i]-Ebi[i])^2)) } } } list(postprob = postprob, label = label, deviance = dev, bic = round(bic,2), prior.param=prior.param, prior.model.weights=prior/prior.weight.denom, npar = (size), which = which, probne0 = c(probne0), postmean = Ebi, postsd = SDbi, mle = EbiMk, se = sebiMk, scale = scale.est ) }