# R code for chapter 60 Logistic regression: a binary outcome
# Book details:
# Health Science Statistics using R and R Commander
# by Robin Beaumont
# Paperback - ISBN 9781907904318
# http://www.amazon.co.uk/Health-Science-Statistics-using-Commander/dp/190790431X
# or direct from the publisher at:
# http://www.scionpublishing.com/
# book website: http://www.robin-beaumont.co.uk/rbook
###########################################################

############# first simple logistic regression
# data from my website:
# treadmill <- read.delim("http://www.robin-beaumont.co.uk/virtualclassroom/book2data/treadmill.dat", header=TRUE)
# or local data, ignore:
# treadmill <- read.table("D:/web_sites_mine/HIcourseweb new/book2data/treadmill.dat",
#                         header=TRUE, sep="\t", na.strings="NA", dec=".", strip.white=TRUE)
# or generate the data on the fly:
time <- c(1014, 684, 810, 990, 840, 978, 1002, 1110, 864, 636, 638, 708, 786, 600, 750, 594, 750)
chd_status <- c("healthy", "healthy", "healthy", "healthy", "healthy", "healthy", "healthy", "healthy",
                "diseased", "diseased", "diseased", "diseased", "diseased", "diseased", "diseased",
                "diseased", "diseased")
# recode the outcome as numeric 0/1 (healthy = 0, diseased = 1) and put both variables in a data frame;
# note that data.frame(cbind(time, chd_status)) would coerce time to text, so build the data frame directly
chd_status <- ifelse(chd_status == "diseased", 1, 0)
treadmill <- data.frame(time, chd_status)
str(treadmill)
treadmill$chd_status
str(treadmill$chd_status)
str(treadmill)
#########
###### if the y variable was imported as a factor (e.g. using the treadmill2 file):
##### the Hosmer-Lemeshow test below only works with 0 and 1 values in y,
##### whereas R gives imported factors the levels 1 and 2 automatically,
## so the levels need relabelling
levels(treadmill$chd_status)
levels(treadmill$chd_status) <- c(0, 1)
treadmill$chd_status
str(treadmill$chd_status)
####### end if
names(treadmill)
######## basic summary stats
library(lattice, pos=4)
xyplot(chd_status ~ time, type="p", pch=16, auto.key=list(border=TRUE),
       par.settings=simpleTheme(pch=16),
       scales=list(x=list(relation='same'), y=list(relation='same')),
       data=treadmill)
### above generated by R Commander; the following does the same thing and allows additions,
# e.g. a vertical line showing the mean time
plot(chd_status ~ time, data=treadmill)
abline(v=809)
############# binary outcome comparisons
boxplot(time ~ chd_status, data=treadmill)
summary(treadmill)
#       time          chd_status    
#  Min.   : 594.0   Min.   :0.0000  
#  1st Qu.: 684.0   1st Qu.:0.0000  
#  Median : 786.0   Median :1.0000  
#  Mean   : 809.1   Mean   :0.5294  
#  3rd Qu.: 978.0   3rd Qu.:1.0000  
#  Max.   :1110.0   Max.   :1.0000  
#######
tapply(treadmill$time, treadmill$chd_status, mean)
#        0        1 
# 928.5000 702.8889 
tapply(treadmill$time, treadmill$chd_status, length)
# 0 1 
# 8 9 
# While the above shows the number of healthy / diseased subjects,
# it would be useful to see the diseased / healthy split by the mean time on the treadmill.
# This requires adding a time category:
# first specify the break point
themean <- mean(treadmill$time)
# cut() needs one more break point than the number of groups you wish to create;
# we want two groups, so we need 3 break points.
# -Inf and +Inf are special constants in R meaning negative and positive infinity
temp_data <- cbind(treadmill,
                   time_category = cut(treadmill$time, breaks = c(-Inf, themean, +Inf),
                                       include.lowest = TRUE))
temp_data
# now give the levels sensible names
levels(temp_data$time_category) <- c("<= mean", "> mean")
temp_data
## Calculate the number per time category
with(temp_data, table(chd_status, time_category))
# or, equivalently:
table(temp_data$chd_status, temp_data$time_category)
# the following produces an error:
# Error in table(chd_status, time_category, data = temp_data) :
#   object 'chd_status' not found
# because you can't use the data= argument in table(), hence the with() option above
table(chd_status, time_category, data=temp_data)
##########################
## now model creation
#########################
# create the model, specifying that the errors follow a binomial pdf
chd_model1 <- glm(chd_status ~ time, family=binomial(), data=treadmill)
summary(chd_model1)
# see the estimation history using the trace=TRUE option
chd_model2 <- glm(chd_status ~ time, family=binomial(), trace = TRUE, data=treadmill)
summary(chd_model2)
# tighten the convergence tolerance - forces more iterations
chd_model3 <- glm(chd_status ~ time, family=binomial(), trace = TRUE, epsilon = 1e-14, data=treadmill)
summary(chd_model3)
###### confidence intervals
confint(chd_model1)
plot(confint(chd_model1))
# confint() works on the log odds scale, so exponentiate to get the odds ratio scale as well
exp(confint(chd_model1))
exp(confint(chd_model1, parm="time"))
########## AIC
AIC(chd_model1)
# AIC = residual deviance + 2 x number of parameters; the relevant components are:
chd_model1$null.deviance
chd_model1$deviance
chd_model1$df.residual
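############ not in the original script: a short illustration, using the deviance components above,
############ of the overall model (likelihood ratio) chi-square test; anova() reports the same value
anova(chd_model1, test = "Chisq")
# or by hand: the drop in deviance from the null model, referred to a chi-square with 1 df
pchisq(chd_model1$null.deviance - chd_model1$deviance, df = 1, lower.tail = FALSE)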
################################################
# A function to do the Hosmer-Lemeshow test in R.
########### version 1
# R function is due to Peter D. M. Macdonald, McMaster University.
hosmerlem <- function(y, yhat, g = 10) {
  cutyhat <- cut(yhat, breaks = quantile(yhat, probs = seq(0, 1, 1/g)), include.lowest = TRUE)
  obs <- xtabs(cbind(1 - y, y) ~ cutyhat)
  expect <- xtabs(cbind(1 - yhat, yhat) ~ cutyhat)
  chisq <- sum((obs - expect)^2 / expect)
  P <- 1 - pchisq(chisq, g - 2)
  c("X^2" = chisq, Df = g - 2, "P(>Chi)" = P)
}
hosmerlem(treadmill$chd_status, fitted(chd_model1))
############ if using a factor variable as y:
hosmerlem(as.numeric(as.character(treadmill$chd_status)), fitted(chd_model1))
# note: as.numeric() on its own would give the level codes 1 and 2, not 0 and 1, hence as.character() first
# hosmerlem(as.numeric(treadmill$chd_status), fitted(chd_model1))
########### end if
# gives
#       X^2        Df   P(>Chi) 
# 9.0120881 8.0000000 0.3412772 
########### version 2  Hosmer & Lemeshow R-squared - Andy Field's version
# R-squared_L = (null deviance - residual deviance) / null deviance
(chd_model1$null.deviance - chd_model1$deviance) / chd_model1$null.deviance
##################
########### version 3  Hosmer-Lemeshow test in the ResourceSelection package
install.packages("ResourceSelection")
library(ResourceSelection)
hoslem.test(treadmill$chd_status, fitted(chd_model1), g = 10)
############ if using a factor variable as y:
hoslem.test(as.numeric(as.character(treadmill$chd_status)), fitted(chd_model1), g = 10)
########### end if
# gives
# Hosmer and Lemeshow goodness of fit (GOF) test
# data:  as.numeric(as.character(treadmill$chd_status)), fitted(chd_model1)
# X-squared = 9.0121, df = 8, p-value = 0.3413
################### version 4  rms package (the updated Design package)
#### for details see:
# Hosmer DW, Hosmer T, le Cessie S, Lemeshow S. 1997
# A comparison of goodness-of-fit tests for the logistic regression model.
# Statistics in Medicine, 16, 965-980.
## this produces an improved version of the Hosmer-Lemeshow test
install.packages("rms")
library(rms)
chd_model1_lrm <- lrm(chd_status ~ time, x=TRUE, y=TRUE, data=treadmill)
chd_model1_lrm
resid(chd_model1_lrm, 'gof')
# gives
# Sum of squared errors    Expected value|H0                   SD 
#             2.0287961            1.9722895            0.1408183 
#                     Z                    P 
#             0.4012733            0.6882189 
############################
################### version 5  MKmisc package
install.packages("MKmisc")
library(MKmisc)
HLgof.test(fit = fitted(chd_model1), obs = treadmill$chd_status)
############ if using a factor variable as y:
HLgof.test(fit = fitted(chd_model1), obs = as.numeric(as.character(treadmill$chd_status)))
########### end if
# gives
# $C
#  Hosmer-Lemeshow C statistic
# data:  fitted(chd_model1) and treadmill$chd_status
# X-squared = 9.0121, df = 8, p-value = 0.3413
#
# $H
#  Hosmer-Lemeshow H statistic
# data:  fitted(chd_model1) and treadmill$chd_status
# X-squared = 4.7516, df = 8, p-value = 0.7838
#
####################################### end of Hosmer-Lemeshow section
###
### to obtain odds ratios and confidence intervals
# summary(chd_model1)
# summary() does not give the odds in its output, only the log odds,
# therefore we need to exponentiate the values using exp()
exp(coef(chd_model1))
exp(coef(chd_model1)["time"])
# we also need the confidence interval, using the confint() command
exp(confint(chd_model1, parm="time"))
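############ not in the original script: a small illustration of re-expressing the odds ratio on a
############ more convenient scale - the change in odds per 60 seconds (1 minute) of treadmill time,
############ obtained by multiplying the log odds coefficient (and its CI) by 60 before exponentiating
exp(60 * coef(chd_model1)["time"])
exp(60 * confint(chd_model1, parm = "time"))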
# now obtain the fitted values (the predicted probabilities)
fitted1 <- predict(chd_model1, type = "response")
fitted1
predict_rate <- fitted1
###################
## Classification table
# can create a simple one with no headings:
table(predict_rate > .5, treadmill$chd_status)
# to add overall + individual row/column labels:
class_table <- table(predict_rate > .5, treadmill$chd_status)
dimnames(class_table) <- list(predicted = c("healthy", "CHD"), observed = c("healthy", "CHD"))
class_table
########
## stats for the classification table
# Sensitivity = true positive (tp) rate = tp/(tp+fn) = 8/(8+1) = 88.9%
# that is, given that they are diseased, the probability of being correctly classified is 88.9%
# = p(pred diseased | diseased)
100 * (class_table[2,2] / (class_table[2,2] + class_table[1,2]))
###
# Specificity = true negative (tn) rate = tn/(tn+fp) = 6/(6+2) = 75%
# that is, given that they are not diseased, the probability of being correctly classified is 75%
# = p(pred not diseased | not diseased)
100 * (class_table[1,1] / (class_table[1,1] + class_table[2,1]))
###
# False positive ratio (FPR) = fp/(tn+fp) = 2/(2+6) = 2/8 = .25 = 25%
# that is, given that they are not diseased, the probability of being incorrectly classified as diseased
# = p(pred diseased | not diseased)
100 * (class_table[2,1] / (class_table[2,1] + class_table[1,1]))
###
# False negative ratio (FNR) = fn/(fn+tp) = 1/(1+8) = 1/9 = 0.111 = 11%
# that is, given that they are diseased, the probability of being incorrectly classified as healthy
# = p(pred not diseased | diseased)
100 * (class_table[1,2] / (class_table[1,2] + class_table[2,2]))
###
# Checks: specificity = 1 - FPR -> 1 - .25 = .75 qed;
#         FNR = 1 - sensitivity -> 1 - .8889 = .1111 qed
## Considering the columns:
# Positive predictive value = tp/(tp+fp) = 8/(8+2) = 80%
100 * (class_table[2,2] / (class_table[2,2] + class_table[2,1]))
##
# Negative predictive value = tn/(tn+fn) = 6/(6+1) = 0.8571 = 85.71%
100 * (class_table[1,1] / (class_table[1,1] + class_table[1,2]))
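############ not in the original script: a small convenience function (hypothetical, for illustration
############ only) returning the same classification statistics in one call; it assumes a 2 x 2 table
############ laid out like class_table above (rows = predicted healthy/CHD, columns = observed healthy/CHD)
class_stats <- function(tab) {
  tn <- tab[1, 1]; fp <- tab[2, 1]; fn <- tab[1, 2]; tp <- tab[2, 2]
  c(sensitivity = tp / (tp + fn),
    specificity = tn / (tn + fp),
    PPV = tp / (tp + fp),
    NPV = tn / (tn + fn),
    accuracy = (tp + tn) / sum(tab))
}
round(100 * class_stats(class_table), 1)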
########################
##### pseudo R squared
# now to create the other R-square-like statistics
chd_model1
# the deviance values provided by R are -2 log-likelihoods, but we need the log-likelihoods themselves;
# we can either just use the function logLik(), which returns the required value,
# or divide the deviances by 2 and add a minus sign
logLik(chd_model1)
# chd_model1$deviance      or chd_model1[10] = -2 log-likelihood for the current model
# chd_model1$null.deviance or chd_model1[12] = -2 log-likelihood for the null model
Likelihood_final <- -chd_model1$deviance/2
Likelihood_init  <- -chd_model1$null.deviance/2
#### cox_snell method 1
cox_snell <- 1 - exp(-(2*(Likelihood_final - Likelihood_init)/length(treadmill$time)))
#### cox_snell method 2, quick and easy version
cox_snell2 <- 1 - exp((chd_model1$deviance - chd_model1$null.deviance)/length(treadmill$time))
### nagelkerke's method 1
# using the formula in the R companion to the Agresti book by Laura A. Thompson
nagelkerke <- (1 - exp((chd_model1$deviance - chd_model1$null.deviance)/length(treadmill$time))) /
              (1 - exp(-chd_model1$null.deviance/length(treadmill$time)))
### nagelkerke's method 2, using the formula in Andy Field's book p.269
nagelkerke1 <- cox_snell/(1 - exp(2*(Likelihood_init)/length(treadmill$time)))
cat("Likelihood_final: ", Likelihood_final, "\n")
cat("Likelihood_init: ", Likelihood_init, "\n")
cat("cox_snell: ", cox_snell, "\n")
cat("cox_snell2: ", cox_snell2, "\n")
cat("nagelkerke: ", nagelkerke, "\n")
cat("nagelkerke1: ", nagelkerke1, "\n")
###############
################## plot of observed versus predicted
plot(treadmill$time, treadmill$chd_status, xlab = "time (secs)", ylab = "probability of chd")
curve(predict(chd_model1, data.frame(time = x), type="response"), add=TRUE, col="red")
## above works
######################################################
#####################################################
## multiple logistic regression example: prostate dataset from Stevens' multivariate statistics book p.149
prostate_stevens <- read.delim("http://www.robin-beaumont.co.uk/virtualclassroom/book2data/prostate_brown_1980.dat", header=TRUE)
# or local data, ignore:
# prostate_stevens <- read.table("D:/web_sites_mine/HIcourseweb new/book2data/prostate_brown_1980.dat",
#                                header=TRUE, sep="\t", na.strings="NA", dec=".", strip.white=TRUE)
names(prostate_stevens)
summary(prostate_stevens)
prostate1 <- glm(Nodes ~ Acid + Age + Grade + Stage + X.ray, family=binomial(logit), data=prostate_stevens)
summary(prostate1)
confint(prostate1, level=0.95, type="LR")
prostate2 <- glm(Nodes ~ Stage + X.ray, family=binomial(logit), data=prostate_stevens)
summary(prostate2)
# compare the reduced model with the full model
anova(prostate2, prostate1, test="Chisq")
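############ not in the original script: odds ratios with profile-likelihood confidence intervals for
############ the predictors retained in the reduced model, combined into one table
exp(cbind(OR = coef(prostate2), confint(prostate2)))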
#### tables of counts
prostate_stevens$factor_X.ray <- as.factor(prostate_stevens$X.ray)
prostate_stevens$factor_Stage <- as.factor(prostate_stevens$Stage)
prostate_stevens$factor_Nodes <- as.factor(prostate_stevens$Nodes)
## not actually necessary to convert to factors when using the xtabs() code below
count_table <- xtabs(~ factor(Nodes) + factor(Stage), data=prostate_stevens)
count_table
# the following equivalent also works:
count_table1 <- xtabs(~ Nodes + Stage, data=prostate_stevens)
count_table1
count_table2 <- xtabs(~ Nodes + X.ray, data=prostate_stevens)
count_table2
count_table3 <- xtabs(~ Nodes + Stage + X.ray, data=prostate_stevens)
count_table3
chi_test1 <- chisq.test(count_table1)
chi_test1
chi_test2 <- chisq.test(count_table2)
chi_test2
# note: chisq.test() expects a two-way table; given the three-way table below it treats the counts
# as a single vector and runs a goodness-of-fit test, so interpret the result with care
chi_test3 <- chisq.test(count_table3)
chi_test3
##### predicted values and regression diagnostics
prostate_stevens$fitted.prostate2 <- fitted(prostate2)
prostate_stevens$residuals.prostate2 <- residuals(prostate2)
prostate_stevens$rstudent.prostate2 <- rstudent(prostate2)
prostate_stevens$hatvalues.prostate2 <- hatvalues(prostate2)
prostate_stevens$cooks.distance.prostate2 <- cooks.distance(prostate2)
prostate_stevens$obsNumber <- 1:nrow(prostate_stevens)
str(prostate_stevens)
##########################################################################################
## Classification table
# can create a simple one with no headings:
table(prostate_stevens$fitted.prostate2 > .5, prostate_stevens$Nodes)
# to add overall + individual row/column labels:
class_table <- table(prostate_stevens$fitted.prostate2 > .5, prostate_stevens$Nodes)
dimnames(class_table) <- list(predicted = c("no nodal", "nodal"), observed = c("no nodal", "nodal"))
class_table
#############################################################
############# comparing results with the dichotomised variables in the boot package
# install.packages("boot")
# You can either load the boot package and the dataset using the following R code
# or the R Commander menu options shown opposite.
library(boot)
data(nodal, package="boot")
names(nodal)
# [1] "m"     "r"     "aged"  "stage" "grade" "xray"  "acid"
# r = nodal involvement
prostate1_binary <- glm(r ~ acid + aged + grade + stage + xray, family=binomial(logit), data=nodal)
summary(prostate1_binary)
prostate2_binary <- glm(r ~ stage + xray, family=binomial(logit), data=nodal)
summary(prostate2_binary)
########### a predicted curve is not possible for the final model as it has no continuous predictors
# curve(predict(prostate2, data.frame(time = x), type="response"), add=TRUE, col="red")
##############################################################
######### using tables of counts ###################################
# example below using proportions
# adaptation of the treadmill example and of http://yatani.jp/HCIstats/LogisticRegression (see below)
time <- c(450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000)
chd <- c(80, 77, 76, 70, 82, 40, 32, 34, 30, 21, 13, 2)
total <- c(90, 95, 96, 88, 134, 50, 40, 60, 84, 96, 121, 198)
normal <- (total - chd)
thedata <- cbind(chd, normal, total, time)
thedata <- data.frame(thedata)
str(thedata)
# can also see the observed proportions
chd/total
#  [1] 0.88888889 0.81052632 0.79166667 0.79545455 0.61194030 0.80000000 0.80000000 0.56666667
#  [9] 0.35714286 0.21875000 0.10743802 0.01010101
# create the two-column response matrix (successes, failures); not really necessary, see below
testresult <- cbind(chd, normal)
result <- glm(testresult ~ time, family = binomial)
summary(result)
# the on-the-fly method
result2 <- glm(cbind(chd, total - chd) ~ time, family = binomial, data = thedata)
summary(result2)
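############ not in the original script: a third, equivalent way of fitting grouped binomial data -
############ model the observed proportion directly and supply the group sizes as prior weights;
############ the estimated coefficients match result and result2 above
result3 <- glm(chd/total ~ time, weights = total, family = binomial, data = thedata)
summary(result3)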
## adapted to the current situation
predict_rate <- predict(result, type="response")
predict_rate
# first plot the observed proportions from the dataset
plot(time, chd/total, cex=1.5, xlim = c(400, 1300))
# now add the proportions predicted from the model to the plot
points(time, predict_rate, pch=3, col="red", cex=1.5)
# now draw the smoothed curve
x <- seq(400, 1200, 50)  # create a vector of values from 400 to 1200 at intervals of 50
y <- 1 - (1 / (1 + exp(result$coef[1] + result$coef[2]*x)))  # the probability of chd (being diseased)
# y <- (1 / (1 + exp(result$coef[1] + result$coef[2]*x)))    # would give the probability of being healthy
lines(x, y, col="blue", lwd=2)
############
# from http://yatani.jp/HCIstats/LogisticRegression
Time <- c(10,20,30,40,50,60,10,20,30,40,50,60)
Success <- c(10,23,40,70,82,96,12,20,35,58,76,90)
Failure <- 100 - Success
GameResult <- cbind(Success, Failure)
result <- glm(GameResult ~ Time, family = binomial)
summary(result)
predict_rate <- predict(result, type="response")
predict_rate
plot(Time, Success/100, cex=1.5)
points(Time, predict_rate, pch=3, col="red", cex=1.5)
x <- seq(10, 60, 1)
# hard-coded from the model's fitted coefficients; equivalent to plogis(-3.142800 + 0.091572*x)
y <- 1 / (1 + exp(3.142800 - 0.091572*x))
lines(x, y, col="blue", lwd=2)
##########################################
# Example of building a data frame of counts from a published table
# first create the gender factor
gender <- rep(c("Male","Female"), c(6,6))
# now the department factor
dept <- rep(LETTERS[1:6], 2)
# now create the two columns of counts of admitted / rejected
yes <- c(512,353,120,138,53,22,89,17,202,131,94,24)
no <- c(313,207,205,279,138,351,19,8,391,244,299,317)
# now put them in a data frame
ucb_dataframe <- data.frame(gender, dept, yes, no)
ucb_model1 <- glm(cbind(yes,no) ~ gender*dept, family=binomial(logit), data=ucb_dataframe)
summary(ucb_model1)
#########################################
#######################################
######### below: development work
plot(treadmill$time, treadmill$chd_status, xlab = "time (secs)", ylab = "probability of chd")
# the next lines do not work
# predict_rate <- predict(chd_model1, data.frame(time = x), type="response")
# predict_rate
# curve(predict(chd_model1, type="response"), data.frame(time = x), add=TRUE, col="red")
# curve(predict_rate, col="red")
# end of does not work
######################### individual predicted values
# reminder of what the model looks like:
## chd_model1 <- glm(chd_status ~ time, family=binomial(), data=treadmill)
####### first, predictions with the data we have:
predict_rate <- predict(chd_model1, type="response")
predict_rate
####### now predictions with some new data:
newtime <- c(500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100)
newdata <- data.frame(time = newtime)  # the predict function expects the new data in a data frame
new_time_predicts <- predict(chd_model1, newdata, type="response")
new_time_predicts
# or generate the same sequence of values with seq()
newtime <- seq(from=500, to=1100, by=50)
# make it look better:
predictions <- data.frame(time = newtime, "prob of chd" = new_time_predicts)
predictions
############################ confidence intervals for individual values
# taken from http://stackoverflow.com/questions/14423325/confidence-intervals-for-predictions-from-logistic-regression
###
# call predict() with type = "link", and
# call predict() with se.fit = TRUE.
new_time_predicts <- predict(chd_model1, newdata, type = "link", se.fit = TRUE)
# The confidence interval is then
critval <- 1.96  ## approx 95% CI
upr <- new_time_predicts$fit + (critval * new_time_predicts$se.fit)
lwr <- new_time_predicts$fit - (critval * new_time_predicts$se.fit)
fit <- new_time_predicts$fit
upr; lwr; fit
# Now for fit, upr and lwr we need to apply the inverse of the link function to them.
fit2 <- chd_model1$family$linkinv(fit)
upr2 <- chd_model1$family$linkinv(upr)
lwr2 <- chd_model1$family$linkinv(lwr)
lwr2; fit2; upr2
#########################
# Possible alternative, to be developed (left commented out as 'y.glm' and 'something' are placeholders):
# pred <- predict(y.glm, newdata = something, se.fit = TRUE)
# ci <- matrix(c(pred$fit + 1.96 * pred$se.fit, pred$fit - 1.96 * pred$se.fit), ncol = 2)
# lines(something, plogis(ci[,1]))
# lines(something, plogis(ci[,2]))
###################### now add the CIs to the plot
plot(treadmill$time, treadmill$chd_status, xlab = "time (secs)", ylab = "probability of chd")
curve(predict(chd_model1, data.frame(time = x), type="response"), add=TRUE, col="red")
lines(newtime, upr2, col="blue", lty=2, lwd=1)
lines(newtime, lwr2, col="red", lty=3, lwd=1)
# lines(newtime, fit2, col="yellow", lwd=1)
#################
######################## Bubble plot (see Everitt & Hothorn 2006 p.98)
# needs two independent variables!
###################### power analysis needs the z score of time: scale(treadmill$time)
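############ not in the original script: a minimal sketch of the standardisation mentioned above -
############ create the z score of time and refit, so the coefficient gives the change in log odds per
############ standard deviation of treadmill time
treadmill$time_z <- as.numeric(scale(treadmill$time))
chd_model1_z <- glm(chd_status ~ time_z, family = binomial(), data = treadmill)
summary(chd_model1_z)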