[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Much awaited magnificent tour d' logit



Download the data file and the command file, put in same directory.

If there are nasty logit tricks to be known, this is the place to find them. Beautiful tables, beautiful graphs.

--
Paul E. Johnson                       email: pauljohn_AT_ku.edu
Dept. of Political Science            http://lark.cc.ku.edu/~pauljohn
1541 Lilac Lane, Rm 504
University of Kansas                  Office: (785) 864-9086
Lawrence, Kansas 66044-3177           FAX: (785) 864-5700
#The R Code, based on Ian Ostrander's project:

load("anes2004.rda")


##mydf$partyid[mydf$partyid>6]<-NA

##mydf$religion <- myfile$P045174a
##mydf$religion[mydf$religion>5]<-NA
#coded 1-5 in decreasing religiosity

##mydf$income <- myfile$P045180x
##mydf$income[mydf$income>7]<-NA

##kb<-myfile$P045049a

#Recoding voteKer (Kerry v. Bush vote)
##table(kb)
##mydf$kbvote<-kb
##mydf$kbvote[kb>3]<-NA
##table(mydf$kbvote)
##mydf$kbvote[kb==3]<-0
##table(mydf$kbvote)
#kbvote is coded K(1),B(0)

#Logistic Regression of model
mylog<-glm (kbvote~partyid+income +religion , family=binomial, data=mydf,model=T)
summary(mylog)


# Make some nice pictures!

newdf <- expand.grid(partyid=0:6, religion=1:5, income=1:7)
p1 <- predict(mylog, newdata=newdf, se.fit=T,type="response")
newdf$p1Rfit <- p1$fit
newdf$p1Rse <- p1$Rse
p1 <- predict(mylog, newdata=newdf, se.fit=T,type=c("link"))
newdf$p1Lfit <- p1$fit
newdf$p1Lse <- p1$Rse              
rm(p1)



with(mylog$model,
     plot(partyid, kbvote, main="Logistic Regression",xlab="Party ID: 7 point scale",ylab="Vote For Kerry",pch=16) )

# Add lines one a time, for each value of religion. Income is set at 4
# for all of them.  Can automate this (as shown below).

 with(subset(newdf, newdf$religion==1 & newdf$income==4),
      lines(partyid,p1Rfit,col=religion,lty=religion))
 with(subset(newdf, newdf$religion==2 & newdf$income==4),
      lines(partyid,p1Rfit,col=religion,lty=religion))
 with(subset(newdf, newdf$religion==3 & newdf$income==4),
      lines(partyid,p1Rfit,col=religion,lty=religion))
 with(subset(newdf, newdf$religion==4 & newdf$income==4),
      lines(partyid,p1Rfit,col=religion,lty=religion))
 with(subset(newdf, newdf$religion==4 & newdf$income==4),
      lines(partyid,p1Rfit,col=religion,lty=religion))

legend(locator(1), paste("religion:", 1:5), lty=min(newdf$religion):max(newdf$religion), col=min(newdf$religion):max(newdf$religion))
 text(locator(1), paste("income=",4))


# matplot would automatically create that plot if you re-arranged your data.
# 
# You don't need matplot, though.  This automates the previous lining exercise,
# doing one at a time.
# for ( i in unique(newdf$religion))
# {
#   with(subset(newdf, newdf$religion==i & newdf$income==4),
#       lines(partyid, p1fit, type="l", col=i,lty=i))
# }


# Following does work because the name of the data frame in the by statement is
# not able to refer to the data frame created by with

#with( subset(newdf, newdf$income==4),
#  by ( ??name of df??, as.factor(newdf$religion), function(thedf) lines(thedf$partyid, thedf$p1fit, type="l", col=thedf$religion)))


library(gregmisc)

aggtableR <- with(subset(newdf,income==4),
                  aggregate.table(p1Rfit, by1=partyid, by2=religion, sum))
aggtableR


aggtableL <- with(subset(newdf,income==4),
                  aggregate.table(p1Lfit, by1=partyid, by2=religion, sum))
aggtableL




###------------------------------------------------



#Should partyID be treated as a numerical variable?  Perhaps instead it should use

mydf$partyFac <- factor(mydf$partyid)

# or an ordered factor:

mydf$partyOrd <- ordered(mydf$partyid)

mylog2 <-glm(kbvote~partyFac+income+religion,family=binomial, data=mydf, model=T)
summary(mylog2)

anova(mylog, mylog2, test="Chisq")

mylog3 <-glm(kbvote~partyOrd+income+religion,family=binomial, data=mydf, model=T)
summary(mylog3)

anova(mylog, mylog3, test="Chisq")


# It would be nice to see a picture of the ordinal predictor's
# impact.  The Design library's default plot method will show the
# prediction in terms of the log-odds, but I want probabilities as well
# Plus, I might be stranded on a deserted isle without Design someday
# and want to be able to do this without it.


# Since this dataframe has only 3 predictors, it is quite easy to
# use expand.grid to create a new data frame for which we want predictions

# First, extract the model data from the fitted model object.The advantage is
# that all missings have been purged by the glm. Lets go on being
# incorrect and tread income as if it were numeric. We'll just use
# its average to calculate predictions.  We
# estimated religion as a numeric variable, but should not have.
# Nevertheless, we'll just use its observed levels in the predicting dataset

mod3df <- mylog3$model

#just set income to 4.
# when we made newdf above, we assumed we knew the ranges for the variables.
# the approach here is slightly more general, using "levels" and "unique" to
# get values for partyOrd and religion, respectively.
newdf2 <- expand.grid (partyOrd=levels(mod3df$partyOrd),
                      income = 4,
                      religion=sort(unique(mod3df$religion)))

p1 <- predict(mylog3, newdata= newdf2, se.fit=T, type="link")

newdf2$pred3L <- p1$fit
newdf2$se3L <- p1$se.fit

p1 <- predict(mylog3, newdata= newdf2, se.fit=T, type="response")
newdf2$pred3R <-p1$fit
newdf$se3R <- p1$se.fit


#To make tables of predicted probabilities, sometimes it is nice to use
#"aggregate.table" from gregmisc.

library(gregmisc)

aggtable3L <- aggregate.table(newdf2$pred3L, by1=newdf2$partyOrd, by2=newdf2$religion, sum)

aggtable3L


aggtable3R <- aggregate.table(newdf2$pred3R, by1=newdf2$partyOrd, by2=newdf2$religion, sum)

aggtable3R



# now make a plot showing vote/partyID predictions for each value of religion
matplot(levels(newdf2$partyOrd), aggtable3R, type="l")


# What's that telling me? It says, well, religion is not so important
# and that the effect of party is 'mostly negative' but not entirely linear.

# So let's just set religion to its modal value. To find that, observe

table(mod3df$religion)

# the mode is 1

# that calculation can be automated:

tab <- table(mod3df$religion)
mode.religion <-  rownames(tab)[which.max(tab)]

newdf2 <- expand.grid (partyOrd = levels(mod3df$partyOrd),
                       income = 4,
                       religion = as.numeric(mode.religion))


pred3L <- predict (mylog3, newdata= newdf2, type="link")

par(mfrow=c(2,1))

plot(newdf2$partyOrd,pred3L,ylab="ln[p/(1-p)]")



pred3R <- predict(mylog3, newdata= newdf2, type="response")
plot(newdf2$partyOrd,pred3R,ylim=c(-.2,1.2))

points(mod3df$partyOrd, mod3df$kbvote, pch=16, cex=0.4)

par(mfrow=c(1,1))


library(Design)

mylog2lrm <-lrm(kbvote~partyFac+income+religion, data=mydf)
mylog2lrm
dd <- datadist(mydf)
options(datadist="dd")
summary(mylog2lrm)
par(ask=T)
plot(mylog2lrm)


mylog3lrm <-lrm(kbvote~partyOrd+income+religion)

dd <- datadist(partyOrd, partyFac, partyid, income, religion)
options(datadist="dd")
summary(mylog3lrm)
par(ask=T)
plot(mylog3lrm)

Attachment: anes2004.rda
Description: Binary data