# R code for chapter 60 Logistic regression: a binary outcome
# Book details:
# Health Science Statistics using R and R Commander
# by Robin Beaumont
# Paperback - ISBN 9781907904318
# http://www.amazon.co.uk/Health-Science-Statistics-using-Commander/dp/190790431X
# or direct from the publisher at:
# http://www.scionpublishing.com/
# book website: http://www.robin-beaumont.co.uk/rbook
###########################################################

############# first simple logistic regression
# data from my website:
# treadmill <- read.delim("http://www.robin-beaumont.co.uk/virtualclassroom/book2data/treadmill.dat", header=TRUE)
# or local data, ignore:
# treadmill <- read.table("D:/web_sites_mine/HIcourseweb new/book2data/treadmill.dat",
#                         header=TRUE, sep="\t", na.strings="NA", dec=".", strip.white=TRUE)
# or generate the data on the fly:
time <- c(1014, 684, 810, 990, 840, 978, 1002, 1110, 864, 636, 638, 708, 786, 600, 750, 594, 750)
chd_status <- c("healthy", "healthy", "healthy", "healthy", "healthy", "healthy", "healthy", "healthy",
                "diseased", "diseased", "diseased", "diseased", "diseased", "diseased", "diseased",
                "diseased", "diseased")
# recode the outcome as numeric 0/1 (healthy = 0, diseased = 1) and put both variables in a data frame;
# note that data.frame(cbind(time, chd_status)) would coerce time to text, so build the data frame directly
chd_status <- ifelse(chd_status == "diseased", 1, 0)
treadmill <- data.frame(time, chd_status)
str(treadmill)
treadmill$chd_status
str(treadmill$chd_status)
str(treadmill)
#########
###### if the y variable was imported as a factor (e.g. using the treadmill2 file):
##### the Hosmer-Lemeshow test below only works with 0 and 1 values in y,
##### whereas R gives imported factors the levels 1 and 2 automatically,
## so the levels need relabelling
levels(treadmill$chd_status)
levels(treadmill$chd_status) <- c(0, 1)
treadmill$chd_status
str(treadmill$chd_status)
####### end if
names(treadmill)
######## basic summary stats
library(lattice, pos=4)
xyplot(chd_status ~ time, type="p", pch=16, auto.key=list(border=TRUE),
       par.settings=simpleTheme(pch=16),
       scales=list(x=list(relation='same'), y=list(relation='same')),
       data=treadmill)
### above generated by R Commander; the following does the same thing and allows additions,
# e.g. a vertical line showing the mean time
plot(chd_status ~ time, data=treadmill)
abline(v=809)
############# binary outcome comparisons
boxplot(time ~ chd_status, data=treadmill)
summary(treadmill)
#       time          chd_status    
#  Min.   : 594.0   Min.   :0.0000  
#  1st Qu.: 684.0   1st Qu.:0.0000  
#  Median : 786.0   Median :1.0000  
#  Mean   : 809.1   Mean   :0.5294  
#  3rd Qu.: 978.0   3rd Qu.:1.0000  
#  Max.   :1110.0   Max.   :1.0000  
#######
tapply(treadmill$time, treadmill$chd_status, mean)
#        0        1 
# 928.5000 702.8889 
tapply(treadmill$time, treadmill$chd_status, length)
# 0 1 
# 8 9 
# While the above shows the number of healthy / diseased subjects,
# it would be useful to see the diseased / healthy split by the mean time on the treadmill.
# This requires adding a time category:
# first specify the break point
themean <- mean(treadmill$time)
# cut() needs one more break point than the number of groups you wish to create;
# we want two groups, so we need 3 break points.
# -Inf and +Inf are special constants in R meaning negative and positive infinity
temp_data <- cbind(treadmill,
                   time_category = cut(treadmill$time, breaks = c(-Inf, themean, +Inf),
                                       include.lowest = TRUE))
temp_data
# now give the levels sensible names
levels(temp_data$time_category) <- c("<= mean", "> mean")
temp_data
## Calculate the number per time category
with(temp_data, table(chd_status, time_category))
# or, equivalently:
table(temp_data$chd_status, temp_data$time_category)
# the following produces an error:
# Error in table(chd_status, time_category, data = temp_data) :
#   object 'chd_status' not found
# because you can't use the data= argument in table(), hence the with() option above
table(chd_status, time_category, data=temp_data)
##########################
## now model creation
#########################
# create the model, specifying that the errors follow a binomial pdf
chd_model1 <- glm(chd_status ~ time, family=binomial(), data=treadmill)
summary(chd_model1)
# see the estimation history using the trace=TRUE option
chd_model2 <- glm(chd_status ~ time, family=binomial(), trace = TRUE, data=treadmill)
summary(chd_model2)
# tighten the convergence tolerance - forces more iterations
chd_model3 <- glm(chd_status ~ time, family=binomial(), trace = TRUE, epsilon = 1e-14, data=treadmill)
summary(chd_model3)
###### confidence intervals
confint(chd_model1)
plot(confint(chd_model1))
# confint() works on the log odds scale, so exponentiate to get the odds ratio scale as well
exp(confint(chd_model1))
exp(confint(chd_model1, parm="time"))
########## AIC
AIC(chd_model1)
# AIC = residual deviance + 2 x number of parameters; the relevant components are:
chd_model1$null.deviance
chd_model1$deviance
chd_model1$df.residual
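############ not in the original script: a short illustration, using the deviance components above,
############ of the overall model (likelihood ratio) chi-square test; anova() reports the same value
anova(chd_model1, test = "Chisq")
# or by hand: the drop in deviance from the null model, referred to a chi-square with 1 df
pchisq(chd_model1$null.deviance - chd_model1$deviance, df = 1, lower.tail = FALSE)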
################################################
# A function to do the Hosmer-Lemeshow test in R.
########### version 1
# R function is due to Peter D. M. Macdonald, McMaster University.
hosmerlem <- function(y, yhat, g = 10) {
  cutyhat <- cut(yhat, breaks = quantile(yhat, probs = seq(0, 1, 1/g)), include.lowest = TRUE)
  obs <- xtabs(cbind(1 - y, y) ~ cutyhat)
  expect <- xtabs(cbind(1 - yhat, yhat) ~ cutyhat)
  chisq <- sum((obs - expect)^2 / expect)
  P <- 1 - pchisq(chisq, g - 2)
  c("X^2" = chisq, Df = g - 2, "P(>Chi)" = P)
}
hosmerlem(treadmill$chd_status, fitted(chd_model1))
############ if using a factor variable as y:
hosmerlem(as.numeric(as.character(treadmill$chd_status)), fitted(chd_model1))
# note: as.numeric() on its own would give the level codes 1 and 2, not 0 and 1, hence as.character() first
# hosmerlem(as.numeric(treadmill$chd_status), fitted(chd_model1))
########### end if
# gives
#       X^2        Df   P(>Chi) 
# 9.0120881 8.0000000 0.3412772 
########### version 2  Hosmer & Lemeshow R-squared - Andy Field's version
# R-squared_L = (null deviance - residual deviance) / null deviance
(chd_model1$null.deviance - chd_model1$deviance) / chd_model1$null.deviance
##################
########### version 3  Hosmer-Lemeshow test in the ResourceSelection package
install.packages("ResourceSelection")
library(ResourceSelection)
hoslem.test(treadmill$chd_status, fitted(chd_model1), g = 10)
############ if using a factor variable as y:
hoslem.test(as.numeric(as.character(treadmill$chd_status)), fitted(chd_model1), g = 10)
########### end if
# gives
# Hosmer and Lemeshow goodness of fit (GOF) test
# data:  as.numeric(as.character(treadmill$chd_status)), fitted(chd_model1)
# X-squared = 9.0121, df = 8, p-value = 0.3413
################### version 4  rms package (the updated Design package)
#### for details see:
# Hosmer DW, Hosmer T, le Cessie S, Lemeshow S. 1997
# A comparison of goodness-of-fit tests for the logistic regression model.
# Statistics in Medicine, 16, 965-980.
## this produces an improved version of the Hosmer-Lemeshow test
install.packages("rms")
library(rms)
chd_model1_lrm <- lrm(chd_status ~ time, x=TRUE, y=TRUE, data=treadmill)
chd_model1_lrm
resid(chd_model1_lrm, 'gof')
# gives
# Sum of squared errors    Expected value|H0                   SD 
#             2.0287961            1.9722895            0.1408183 
#                     Z                    P 
#             0.4012733            0.6882189 
############################
################### version 5  MKmisc package
install.packages("MKmisc")
library(MKmisc)
HLgof.test(fit = fitted(chd_model1), obs = treadmill$chd_status)
############ if using a factor variable as y:
HLgof.test(fit = fitted(chd_model1), obs = as.numeric(as.character(treadmill$chd_status)))
########### end if
# gives
# $C
#  Hosmer-Lemeshow C statistic
# data:  fitted(chd_model1) and treadmill$chd_status
# X-squared = 9.0121, df = 8, p-value = 0.3413
#
# $H
#  Hosmer-Lemeshow H statistic
# data:  fitted(chd_model1) and treadmill$chd_status
# X-squared = 4.7516, df = 8, p-value = 0.7838
#
####################################### end of Hosmer-Lemeshow section
###
### to obtain odds ratios and confidence intervals
# summary(chd_model1)
# summary() does not give the odds in its output, only the log odds,
# therefore we need to exponentiate the values using exp()
exp(coef(chd_model1))
exp(coef(chd_model1)["time"])
# we also need the confidence interval, using the confint() command
exp(confint(chd_model1, parm="time"))
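############ not in the original script: a small illustration of re-expressing the odds ratio on a
############ more convenient scale - the change in odds per 60 seconds (1 minute) of treadmill time,
############ obtained by multiplying the log odds coefficient (and its CI) by 60 before exponentiating
exp(60 * coef(chd_model1)["time"])
exp(60 * confint(chd_model1, parm = "time"))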
# now obtain the fitted values (the predicted probabilities)
fitted1 <- predict(chd_model1, type = "response")
fitted1
predict_rate <- fitted1
###################
## Classification table
# can create a simple one with no headings:
table(predict_rate > .5, treadmill$chd_status)
# to add overall + individual row/column labels:
class_table <- table(predict_rate > .5, treadmill$chd_status)
dimnames(class_table) <- list(predicted = c("healthy", "CHD"), observed = c("healthy", "CHD"))
class_table
########
## stats for the classification table
# Sensitivity = true positive (tp) rate = tp/(tp+fn) = 8/(8+1) = 88.9%
# that is, given that they are diseased, the probability of being correctly classified is 88.9%
# = p(pred diseased | diseased)
100 * (class_table[2,2] / (class_table[2,2] + class_table[1,2]))
###
# Specificity = true negative (tn) rate = tn/(tn+fp) = 6/(6+2) = 75%
# that is, given that they are not diseased, the probability of being correctly classified is 75%
# = p(pred not diseased | not diseased)
100 * (class_table[1,1] / (class_table[1,1] + class_table[2,1]))
###
# False positive ratio (FPR) = fp/(tn+fp) = 2/(2+6) = 2/8 = .25 = 25%
# that is, given that they are not diseased, the probability of being incorrectly classified as diseased
# = p(pred diseased | not diseased)
100 * (class_table[2,1] / (class_table[2,1] + class_table[1,1]))
###
# False negative ratio (FNR) = fn/(fn+tp) = 1/(1+8) = 1/9 = 0.111 = 11%
# that is, given that they are diseased, the probability of being incorrectly classified as healthy
# = p(pred not diseased | diseased)
100 * (class_table[1,2] / (class_table[1,2] + class_table[2,2]))
###
# Checks: specificity = 1 - FPR -> 1 - .25 = .75 qed;
#         FNR = 1 - sensitivity -> 1 - .8889 = .1111 qed
## Considering the columns:
# Positive predictive value = tp/(tp+fp) = 8/(8+2) = 80%
100 * (class_table[2,2] / (class_table[2,2] + class_table[2,1]))
##
# Negative predictive value = tn/(tn+fn) = 6/(6+1) = 0.8571 = 85.71%
100 * (class_table[1,1] / (class_table[1,1] + class_table[1,2]))
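############ not in the original script: a small convenience function (hypothetical, for illustration
############ only) returning the same classification statistics in one call; it assumes a 2 x 2 table
############ laid out like class_table above (rows = predicted healthy/CHD, columns = observed healthy/CHD)
class_stats <- function(tab) {
  tn <- tab[1, 1]; fp <- tab[2, 1]; fn <- tab[1, 2]; tp <- tab[2, 2]
  c(sensitivity = tp / (tp + fn),
    specificity = tn / (tn + fp),
    PPV = tp / (tp + fp),
    NPV = tn / (tn + fn),
    accuracy = (tp + tn) / sum(tab))
}
round(100 * class_stats(class_table), 1)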
########################
##### pseudo R squared
# now to create the other R-square-like statistics
chd_model1
# the deviance values provided by R are -2 log-likelihoods, but we need the log-likelihoods themselves;
# we can either just use the function logLik(), which returns the required value,
# or divide the deviances by 2 and add a minus sign
logLik(chd_model1)
# chd_model1$deviance      or chd_model1[10] = -2 log-likelihood for the current model
# chd_model1$null.deviance or chd_model1[12] = -2 log-likelihood for the null model
Likelihood_final <- -chd_model1$deviance/2
Likelihood_init  <- -chd_model1$null.deviance/2
#### cox_snell method 1
cox_snell <- 1 - exp(-(2*(Likelihood_final - Likelihood_init)/length(treadmill$time)))
#### cox_snell method 2, quick and easy version
cox_snell2 <- 1 - exp((chd_model1$deviance - chd_model1$null.deviance)/length(treadmill$time))
### nagelkerke's method 1
# using the formula in the R companion to the Agresti book by Laura A. Thompson
nagelkerke <- (1 - exp((chd_model1$deviance - chd_model1$null.deviance)/length(treadmill$time))) /
              (1 - exp(-chd_model1$null.deviance/length(treadmill$time)))
### nagelkerke's method 2, using the formula in Andy Field's book p.269
nagelkerke1 <- cox_snell/(1 - exp(2*(Likelihood_init)/length(treadmill$time)))
cat("Likelihood_final: ", Likelihood_final, "\n")
cat("Likelihood_init: ", Likelihood_init, "\n")
cat("cox_snell: ", cox_snell, "\n")
cat("cox_snell2: ", cox_snell2, "\n")
cat("nagelkerke: ", nagelkerke, "\n")
cat("nagelkerke1: ", nagelkerke1, "\n")
###############
################## plot of observed versus predicted
plot(treadmill$time, treadmill$chd_status, xlab = "time (secs)", ylab = "probability of chd")
curve(predict(chd_model1, data.frame(time = x), type="response"), add=TRUE, col="red")
## above works
######################################################
#####################################################
## multiple logistic regression example: prostate dataset from Stevens' multivariate statistics book p.149
prostate_stevens <- read.delim("http://www.robin-beaumont.co.uk/virtualclassroom/book2data/prostate_brown_1980.dat", header=TRUE)
# or local data, ignore:
# prostate_stevens <- read.table("D:/web_sites_mine/HIcourseweb new/book2data/prostate_brown_1980.dat",
#                                header=TRUE, sep="\t", na.strings="NA", dec=".", strip.white=TRUE)
names(prostate_stevens)
summary(prostate_stevens)
prostate1 <- glm(Nodes ~ Acid + Age + Grade + Stage + X.ray, family=binomial(logit), data=prostate_stevens)
summary(prostate1)
confint(prostate1, level=0.95, type="LR")
prostate2 <- glm(Nodes ~ Stage + X.ray, family=binomial(logit), data=prostate_stevens)
summary(prostate2)
# compare the reduced model with the full model
anova(prostate2, prostate1, test="Chisq")
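############ not in the original script: odds ratios with profile-likelihood confidence intervals for
############ the predictors retained in the reduced model, combined into one table
exp(cbind(OR = coef(prostate2), confint(prostate2)))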
#### tables of counts
prostate_stevens$factor_X.ray <- as.factor(prostate_stevens$X.ray)
prostate_stevens$factor_Stage <- as.factor(prostate_stevens$Stage)
prostate_stevens$factor_Nodes <- as.factor(prostate_stevens$Nodes)
## not actually necessary to convert to factors when using the xtabs() code below
count_table <- xtabs(~ factor(Nodes) + factor(Stage), data=prostate_stevens)
count_table
# the following equivalent also works:
count_table1 <- xtabs(~ Nodes + Stage, data=prostate_stevens)
count_table1
count_table2 <- xtabs(~ Nodes + X.ray, data=prostate_stevens)
count_table2
count_table3 <- xtabs(~ Nodes + Stage + X.ray, data=prostate_stevens)
count_table3
chi_test1 <- chisq.test(count_table1)
chi_test1
chi_test2 <- chisq.test(count_table2)
chi_test2
# note: chisq.test() expects a two-way table; given the three-way table below it treats the counts
# as a single vector and runs a goodness-of-fit test, so interpret the result with care
chi_test3 <- chisq.test(count_table3)
chi_test3
##### predicted values and regression diagnostics
prostate_stevens$fitted.prostate2 <- fitted(prostate2)
prostate_stevens$residuals.prostate2 <- residuals(prostate2)
prostate_stevens$rstudent.prostate2 <- rstudent(prostate2)
prostate_stevens$hatvalues.prostate2 <- hatvalues(prostate2)
prostate_stevens$cooks.distance.prostate2 <- cooks.distance(prostate2)
prostate_stevens$obsNumber <- 1:nrow(prostate_stevens)
str(prostate_stevens)
##########################################################################################
## Classification table
# can create a simple one with no headings:
table(prostate_stevens$fitted.prostate2 > .5, prostate_stevens$Nodes)
# to add overall + individual row/column labels:
class_table <- table(prostate_stevens$fitted.prostate2 > .5, prostate_stevens$Nodes)
dimnames(class_table) <- list(predicted = c("no nodal", "nodal"), observed = c("no nodal", "nodal"))
class_table
#############################################################
############# comparing results with the dichotomised variables in the boot package
# install.packages("boot")
# You can either load the boot package and the dataset using the following R code
# or the R Commander menu options shown opposite.
library(boot)
data(nodal, package="boot")
names(nodal)
# [1] "m"     "r"     "aged"  "stage" "grade" "xray"  "acid"
# r = nodal involvement
prostate1_binary <- glm(r ~ acid + aged + grade + stage + xray, family=binomial(logit), data=nodal)
summary(prostate1_binary)
prostate2_binary <- glm(r ~ stage + xray, family=binomial(logit), data=nodal)
summary(prostate2_binary)
########### a predicted curve is not possible for the final model as it has no continuous predictors
# curve(predict(prostate2, data.frame(time = x), type="response"), add=TRUE, col="red")
##############################################################
######### using tables of counts ###################################
# example below using proportions
# adaptation of the treadmill example and of http://yatani.jp/HCIstats/LogisticRegression (see below)
time <- c(450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000)
chd <- c(80, 77, 76, 70, 82, 40, 32, 34, 30, 21, 13, 2)
total <- c(90, 95, 96, 88, 134, 50, 40, 60, 84, 96, 121, 198)
normal <- (total - chd)
thedata <- cbind(chd, normal, total, time)
thedata <- data.frame(thedata)
str(thedata)
# can also see the observed proportions
chd/total
#  [1] 0.88888889 0.81052632 0.79166667 0.79545455 0.61194030 0.80000000 0.80000000 0.56666667
#  [9] 0.35714286 0.21875000 0.10743802 0.01010101
# create the two-column response matrix (successes, failures); not really necessary, see below
testresult <- cbind(chd, normal)
result <- glm(testresult ~ time, family = binomial)
summary(result)
# the on-the-fly method
result2 <- glm(cbind(chd, total - chd) ~ time, family = binomial, data = thedata)
summary(result2)
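############ not in the original script: a third, equivalent way of fitting grouped binomial data -
############ model the observed proportion directly and supply the group sizes as prior weights;
############ the estimated coefficients match result and result2 above
result3 <- glm(chd/total ~ time, weights = total, family = binomial, data = thedata)
summary(result3)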
## adapted to the current situation
predict_rate <- predict(result, type="response")
predict_rate
# first plot the observed proportions from the dataset
plot(time, chd/total, cex=1.5, xlim = c(400, 1300))
# now add the proportions predicted from the model to the plot
points(time, predict_rate, pch=3, col="red", cex=1.5)
# now draw the smoothed curve
x <- seq(400, 1200, 50)  # create a vector of values from 400 to 1200 at intervals of 50
y <- 1 - (1 / (1 + exp(result$coef[1] + result$coef[2]*x)))  # the probability of chd (being diseased)
# y <- (1 / (1 + exp(result$coef[1] + result$coef[2]*x)))    # would give the probability of being healthy
lines(x, y, col="blue", lwd=2)
############
# from http://yatani.jp/HCIstats/LogisticRegression
Time <- c(10,20,30,40,50,60,10,20,30,40,50,60)
Success <- c(10,23,40,70,82,96,12,20,35,58,76,90)
Failure <- 100 - Success
GameResult <- cbind(Success, Failure)
result <- glm(GameResult ~ Time, family = binomial)
summary(result)
predict_rate <- predict(result, type="response")
predict_rate
plot(Time, Success/100, cex=1.5)
points(Time, predict_rate, pch=3, col="red", cex=1.5)
x <- seq(10, 60, 1)
# hard-coded from the model's fitted coefficients; equivalent to plogis(-3.142800 + 0.091572*x)
y <- 1 / (1 + exp(3.142800 - 0.091572*x))
lines(x, y, col="blue", lwd=2)
##########################################
# Example of building a data frame of counts from a published table
# first create the gender factor
gender <- rep(c("Male","Female"), c(6,6))
# now the department factor
dept <- rep(LETTERS[1:6], 2)
# now create the two columns of counts of admitted / rejected
yes <- c(512,353,120,138,53,22,89,17,202,131,94,24)
no <- c(313,207,205,279,138,351,19,8,391,244,299,317)
# now put them in a data frame
ucb_dataframe <- data.frame(gender, dept, yes, no)
ucb_model1 <- glm(cbind(yes,no) ~ gender*dept, family=binomial(logit), data=ucb_dataframe)
summary(ucb_model1)
#########################################
#######################################
######### below: development work
plot(treadmill$time, treadmill$chd_status, xlab = "time (secs)", ylab = "probability of chd")
# the next lines do not work
# predict_rate <- predict(chd_model1, data.frame(time = x), type="response")
# predict_rate
# curve(predict(chd_model1, type="response"), data.frame(time = x), add=TRUE, col="red")
# curve(predict_rate, col="red")
# end of does not work
######################### individual predicted values
# reminder of what the model looks like:
## chd_model1 <- glm(chd_status ~ time, family=binomial(), data=treadmill)
####### first, predictions with the data we have:
predict_rate <- predict(chd_model1, type="response")
predict_rate
####### now predictions with some new data:
newtime <- c(500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100)
newdata <- data.frame(time = newtime)  # the predict function expects the new data in a data frame
new_time_predicts <- predict(chd_model1, newdata, type="response")
new_time_predicts
# or generate the same sequence of values with seq()
newtime <- seq(from=500, to=1100, by=50)
# make it look better:
predictions <- data.frame(time = newtime, "prob of chd" = new_time_predicts)
predictions
############################ confidence intervals for individual values
# taken from http://stackoverflow.com/questions/14423325/confidence-intervals-for-predictions-from-logistic-regression
###
# call predict() with type = "link", and
# call predict() with se.fit = TRUE.
new_time_predicts <- predict(chd_model1, newdata, type = "link", se.fit = TRUE)
# The confidence interval is then
critval <- 1.96  ## approx 95% CI
upr <- new_time_predicts$fit + (critval * new_time_predicts$se.fit)
lwr <- new_time_predicts$fit - (critval * new_time_predicts$se.fit)
fit <- new_time_predicts$fit
upr; lwr; fit
# Now for fit, upr and lwr we need to apply the inverse of the link function to them.
fit2 <- chd_model1$family$linkinv(fit)
upr2 <- chd_model1$family$linkinv(upr)
lwr2 <- chd_model1$family$linkinv(lwr)
lwr2; fit2; upr2
#########################
# Possible alternative, to be developed (left commented out as 'y.glm' and 'something' are placeholders):
# pred <- predict(y.glm, newdata = something, se.fit = TRUE)
# ci <- matrix(c(pred$fit + 1.96 * pred$se.fit, pred$fit - 1.96 * pred$se.fit), ncol = 2)
# lines(something, plogis(ci[,1]))
# lines(something, plogis(ci[,2]))
###################### now add the CIs to the plot
plot(treadmill$time, treadmill$chd_status, xlab = "time (secs)", ylab = "probability of chd")
curve(predict(chd_model1, data.frame(time = x), type="response"), add=TRUE, col="red")
lines(newtime, upr2, col="blue", lty=2, lwd=1)
lines(newtime, lwr2, col="red", lty=3, lwd=1)
# lines(newtime, fit2, col="yellow", lwd=1)
#################
######################## Bubble plot (see Everitt & Hothorn 2006 p.98)
# needs two independent variables!
###################### power analysis needs the z score of time: scale(treadmill$time)
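############ not in the original script: a minimal sketch of the standardisation mentioned above -
############ create the z score of time and refit, so the coefficient gives the change in log odds per
############ standard deviation of treadmill time
treadmill$time_z <- as.numeric(scale(treadmill$time))
chd_model1_z <- glm(chd_status ~ time_z, family = binomial(), data = treadmill)
summary(chd_model1_z)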