# This code is to accompany Maximum Likelihood for Social Science: Strategies for Analysis,
# Michael D. Ward and John S. Ahlquist, 2018, Cambridge University Press,
# ISBN 9781107185821. The code is presented as is, but is designed to replicate all of
# the material in the book, chapter by chapter. Any comments, suggestions, or problems
# can be reported to the authors: Michael Ward (michael.don.ward@gmail.com) and
# John Ahlquist (jahlquist@ucsd.edu).
# These programs were designed to run on a variety of platforms, but were most recently run on:
# platform        x86_64-apple-darwin15.6.0
# arch            x86_64
# os              darwin15.6.0
# system          x86_64, darwin15.6.0
# version.string  R version 3.4.4 (2018-03-15)
# nickname        Someone to Lean On
# We used RStudio, Version 1.1.447.
# This code requires installation of the loaded libraries.

rm(list=ls()); gc()

library(foreign); library(car); library(arm); library(stargazer)
library(boot); library(alr3); library(geepack); library(VGAM)
library(xtable); library(verification); library(ROCR); library(DataCombine)
library(glmx); library(sandwich); library(lmtest)
library(aod)             # provides wald.test(), used for Table 3.10
library(separationplot)  # provides separationplot(), used for Figure 3.6

lfp <- read.dta("binlfp2.dta")
set.seed(123)

### Helper function for graphics
t_col <- function(color, percent = 50, name = NULL) {
  # color   = color name
  # percent = % transparency
  # name    = an optional name for the color

  # Get RGB values for named color
  rgb.val <- col2rgb(color)

  # Make new color using input color as base and alpha set by transparency
  t.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3],
               max = 255, alpha = (100 - percent) * 255 / 100,
               names = name)
}
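### A minimal usage sketch (not part of the original script): t_col() returns an
### rgb() hex string with an alpha channel, suitable for col= in plotting calls.
### It is used below for the shaded confidence regions in Figure 3.3. The value is
### returned invisibly, so wrap the call in print() to inspect it at the console.
print(t_col("grey80", percent = 70))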
### Table 3.1, in two parts ##########################
data.obj <- c("M","yes","no",
              "M","no","yes",
              "M","no","no",
              "F","yes","no",
              "F","no","no",
              "F","no","yes",
              "M","no","yes",
              "M","no","yes",
              "M","yes","yes")
df <- data.frame(matrix(data.obj, ncol=3, byrow=T))
colnames(df) <- c("gender","employed","voted")
print(xtable(df, caption="Table 3.1 part a"), include.rownames=FALSE)

data.obj.2 <- c("(M, yes)","2","1",
                "(M, no)","4","3",
                "(F, yes)","1","0",
                "(F, no)","2","1")
df.2 <- data.frame(matrix(data.obj.2, ncol=3, byrow=T))
colnames(df.2) <- c("(Gender, Employed)","Class Size","Voted")
print(xtable(df.2, caption="Table 3.1 part b"), include.rownames=FALSE)
######################################################

### Table 3.2 ########################################
colnames(lfp) <- c("LFP","young.kids","school.kids","age",
                   "college.woman","college.man","wage","income")
df.3 <- data.frame(cbind(colnames(lfp),
                         c("1 if in paid labor force; else 0",
                           "Number of children younger than 6",
                           "Number of children ages 6 to 18",
                           "Age in years",
                           "1 if attended college; else 0",
                           "1 if nonwife attended college; else 0",
                           "Log of woman's estimated wage rate",
                           "Family income, excluding woman's wages")))
colnames(df.3) <- c("Variable Name","Description")
print(xtable(df.3, caption="Table 3.2, variables from Long's Study"),
      include.rownames=FALSE)
######################################################

### Table 3.3 ########################################
yk <- lfp$young.kids
yk <- lfp$young.kids * 0
yk[lfp$young.kids==1] <- 1
yk[lfp$young.kids==2] <- 2
yk[lfp$young.kids==3] <- 3
df.4 <- data.frame(matrix(table(lfp$LFP, yk), nrow=2))
df.4[3,] <- df.4[2,] / df.4[1,]
colnames(df.4) <- as.character(c(0:3)); rownames(df.4) <- c("No","Yes","Odds")
df.4
print(xtable(df.4,
             caption="Table 3.3, Labor Force Participation and the Number of Young Children"),
      include.rownames=FALSE)
######################################################

### Table 3.4 ########################################
linprob.fit <- lm(LFP ~ young.kids + school.kids + age + college.woman + wage,
                  data = lfp)
#xtable(summary(linprob.fit))
stargazer(linprob.fit, star.cutoffs = NA)
######################################################

### Figure 3.1 #######################################
# create binary variable
lfp$lfpbin <- rep(0, length(lfp$LFP))
lfp$lfpbin[lfp$LFP=="inLF"] <- 1
m1 <- lm(lfpbin ~ young.kids + school.kids + age + college.woman + wage, data=lfp)
xtable(m1)
use.vars <- names(m1$model)

#pdf(file = "LPMdiag.pdf", width = 6, height = 3, onefile = TRUE, family = "Times")
par(mfrow=c(1,3))
plot(predict(m1), lfp$lfpbin, ylab="LFP", xlab=expression(hat(y)),
     xlim=c(-0.15,1.15), yaxt="n", las=1, bty="n", pch=19)
title(main="Actual vs Predicted", font=1)
axis(2, at=c(0,1), labels=c("No","Yes"))
abline(lm(lfp$lfpbin ~ predict(m1)))
plot(m1, which=c(1:2), bty="n", pch=19)
par(mfrow=c(1,1))
#dev.off()
######################################################

### Figure 3.2 #######################################
x <- seq(-5, 5, by=.1)
#pdf(file = "invlogit.pdf", width = 4, height = 4)
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 1, 0, 0))
plot(x, plogis(x), type="l", bty="n", lwd=2,
     xlab = expression(x[i]^T*beta), las=1,
     ylab = expression(paste("logi", t^-1, "(", x[i]^T*beta, ")")),
     main="")
segments(0, 0, 0, .5, lty=3, col="slateblue2")
segments(0, .5, -6, .5, lty=3, col="slateblue2")
text(.425, .5, "slope = 1/4", col="slateblue4", srt=58, cex=.6)
#dev.off()

### Code example 3.2
fit.glm <- glm(lfpbin ~ young.kids + school.kids + age + college.woman + wage,
               family = binomial(link = "logit"), data = lfp)
summary(fit.glm)
######################################################

### Table 3.5 ########################################
mod.mat <- m1$model
mod.mat$college.woman <- recode(mod.mat$college.woman,
                                "'College'=1; 'NoCol'=0 ", as.factor=F)
#lfp$hc<-recode(lfp$hc,"'College'=1; 'NoCol'=0 ",as.factor=F)
y <- as.vector(mod.mat$lfpbin)
x <- as.matrix(mod.mat[, 2:ncol(mod.mat)])

# function for ML estimation of the logit model via optim()
binreg <- function(X, y, method="BFGS"){
  X <- cbind(1, X)
  negLL <- function(b, X, y){
    p <- as.vector(1/(1+exp(-X %*% b)))
    - sum(y*log(p) + (1-y)*log(1-p))
  }
  gradient <- function(b, X, y){
    p <- as.vector(1/(1+exp(-X %*% b)))
    - apply(((y - p)*X), 2, sum)
  }
  results <- optim(rep(0, ncol(X)), negLL, gr=gradient,
                   hessian=T, method=method, X=X, y=y)
  list(coefficients=results$par, var=solve(results$hessian),
       deviance=2*results$value, converged=results$convergence==0)
}

mlebin.fit <- binreg(x, y)
# results
round(mlebin.fit$coefficients, 2)

fit.glm <- glm(lfpbin ~ young.kids + school.kids + age + college.woman + wage,
               family = binomial(link = "logit"), data = lfp)
summary(fit.glm, signif.stars=FALSE)
#stargazer(fit.glm,star.cutoffs = NA)
######################################################

### Table 3.6
# Not provided
######################################################

### Table 3.7
vcov(fit.glm)  # from glm()
xtable(vcov(fit.glm), caption="Variance Covariance Matrix from the MLE", digits=3)
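### Sanity check (not in the original script): the standard errors implied by the
### inverse Hessian from binreg() should be close to those reported by glm().
round(sqrt(diag(mlebin.fit$var)), 3)
round(sqrt(diag(vcov(fit.glm))), 3)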
### Table 3.8 ########################################
repl.df <- read.dta("coburn.dta")
dim(repl.df)

simple.logit.fit <- glm(voteno ~ democrat, data=repl.df, family="binomial")
repl.logit.fit <- update(simple.logit.fit, . ~ . + female + poliscimajor +
                           top20phd + top50phd + phdprograms + advancedpercent +
                           petitioners + nsfgrants + yearstoelection + hhsera +
                           seniority)
xtable(repl.logit.fit, digits=3, label="tab:replication",
       caption="Replication of Table 1 in Uscinski and Klofstad (2010)")

cov.lab <- c("Intercept", "Democrat", "Gender (Female)",
             "Political Science Major in College",
             "Number of Top 20 Political Science Programs",
             "Number of Top 50 Political Science Programs",
             "Total Number of Political Science Programs",
             "Percentage with Advanced Degrees",
             "Number of Amendment Petitioners",
             "Number of NSF Grants 2008",
             #"Party Identification (Democrat)",
             "Years to Next Election",
             "Member of Labor HHS Subcommittee",
             "Seniority")

stargazer(simple.logit.fit, repl.logit.fit,
          title = "Replication of Table 1 in Uscinski and Klofstad (2010)",
          label=c("tab:repl1"),
          covariate.labels=c("Democrat", "Gender (Female)",
                             "Political Science Major in College",
                             "Number of Top 20 Political Science Programs",
                             "Number of Top 50 Political Science Programs",
                             "Total Number of Political Science Programs",
                             "Percentage with Advanced Degrees",
                             "Number of Amendment Petitioners",
                             "Number of NSF Grants 2008",
                             #"Party Identification (Democrat)",
                             "Years to Next Election",
                             "Member of Labor HHS Subcommittee",
                             "Seniority"),
          #column.labels=c("Coefficients"),
          star.cutoffs=NA, digits=3,
          dep.var.labels = "Vote Nay on Coburn Amendment")

### Figure 3.3; also code example 3.3
lwg.range <- seq(from=min(lfp$wage), to=max(lfp$wage), by=.1)

# scenario 1: women with no young children
x.lo <- c(1,                        # intercept
          0,                        # young.kids
          median(lfp$school.kids),
          median(lfp$age),
          0,                        # college
          median(lfp$wage))
X.lo <- matrix(x.lo, nrow=length(x.lo), ncol=length(lwg.range))
X.lo[6,] <- lwg.range               # replacing with different wage values

# scenario 2: women with one young child
x.hi <- c(1,                        # intercept
          1,                        # young.kids
          median(lfp$school.kids),
          median(lfp$age),
          0,                        # college
          median(lfp$wage))
X.hi <- matrix(x.hi, nrow=length(x.hi), ncol=length(lwg.range))
X.hi[6,] <- lwg.range               # replacing with different wage values

B.tilde <- mvrnorm(1000, coef(fit.glm), vcov(fit.glm))  # 1000 draws of coefficient vectors
s.lo <- inv.logit(B.tilde %*% X.lo)  # matrix of predicted probabilities
s.hi <- inv.logit(B.tilde %*% X.hi)  # matrix of predicted probabilities
s.lo <- apply(s.lo, 2, quantile, c(0.025, 0.5, 0.975))
s.hi <- apply(s.hi, 2, quantile, c(0.025, 0.5, 0.975))

#pdf(file = "mrozdiffplot.pdf", width = 6, height = 6, onefile = TRUE, family = "Times")
plot(lwg.range, s.lo[2,], ylim=c(0,.9),
     xlab = "wage index", ylab = "Predicted Probability of LFP",
     main = "Wages, Children, and LFP", bty="n", col="white", las=1)
polygon(x=c(lwg.range, rev(lwg.range)), y=c(s.lo[1,], rev(s.lo[3,])),
        col=t_col(grey(0.8), percent=70), border=NA)
polygon(x=c(lwg.range, rev(lwg.range)), y=c(s.hi[1,], rev(s.hi[3,])),
        col=t_col(grey(0.8), percent=70), border=NA)
lines(lwg.range, s.hi[2,], lty=3, lwd=2)
lines(lwg.range, s.lo[2,], lwd=2)
legend(-2, 0.9, legend = c("No Young Children", "One Young Child"),
       lty = c(1,3), lwd=3)
#dev.off()
######################################################
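### A small extension (not in the original code): simulation-based first differences.
### Reusing the coefficient draws (B.tilde) and the scenario matrices (X.lo, X.hi),
### this gives the median and 95% interval for the change in Pr(LFP) when moving
### from no young children to one young child, evaluated at the lowest wage value.
p.lo.sim <- inv.logit(B.tilde %*% X.lo)
p.hi.sim <- inv.logit(B.tilde %*% X.hi)
round(quantile(p.hi.sim[, 1] - p.lo.sim[, 1], c(0.025, 0.5, 0.975)), 3)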
### Figure 3.4 #######################################
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 12, 0, 0))

lcb <- coef(repl.logit.fit) - 2*sqrt(diag(vcov(repl.logit.fit)))
ucb <- coef(repl.logit.fit) + 2*sqrt(diag(vcov(repl.logit.fit)))
lcb <- lcb[-1]  # omit intercept
ucb <- ucb[-1]
coefplot(repl.logit.fit, varnames=cov.lab, main="", cex.pts=1.1, lwd=2,
         lower.conf.bounds=lcb, upper.conf.bounds=ucb)

lcb <- coef(simple.logit.fit) - 2*sqrt(diag(vcov(simple.logit.fit)))
ucb <- coef(simple.logit.fit) + 2*sqrt(diag(vcov(simple.logit.fit)))
lcb <- lcb[-1]  # omit intercept
ucb <- ucb[-1]
coefplot(simple.logit.fit, varnames=c("constant","Democrat"), cex.pts=1.1, lwd=2,
         lower.conf.bounds=lcb, offset=0.4, upper.conf.bounds=ucb,
         add=T, col.pts=grey(0.5))
######################################################

### Table 3.9 ########################################
predicted <- fitted(repl.logit.fit)
actual <- as.numeric(repl.logit.fit$model$voteno)
pred.simp <- fitted(simple.logit.fit)
brier(actual, predicted)
brier(actual, pred.simp)

pred <- predicted*0.0
pred[predicted >= .5] <- 1
tab <- table(pred, actual)
colnames(tab) <- c("Observed 0","Observed 1")
rownames(tab) <- c("Predicted 0","Predicted 1")

tab1 <- xtable(table(pred, actual), caption="Predicted at c=.5 or greater.")
names(tab1) <- c("Observed 0","Observed 1")
row.names(tab1) <- c("Predicted 0","Predicted 1")
tab1
######################################################

### Figure 3.5 #######################################
predicted <- fitted(repl.logit.fit)
actual <- as.vector(repl.logit.fit$model$voteno)
pred <- prediction(predicted, actual)
perf <- performance(pred, "tpr", "fpr")
predsimp <- prediction(pred.simp, actual)
perf.simp <- performance(predsimp, "tpr", "fpr")

#pdf(file = "coburnROC.pdf", width=5, height=5, onefile=TRUE)
par(las=1, bty="n")
plot(perf, main="ROC plots for competing models", bty="n", lwd=2)
plot(perf.simp, lwd=2, col=grey(0.7), add=T)
lines(actual, actual, lty="dashed")
#dev.off()
######################################################
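### Not in the original script: area under the ROC curve for the two Coburn models,
### computed with ROCR's "auc" performance measure.
performance(pred, "auc")@y.values[[1]]       # full model
performance(predsimp, "auc")@y.values[[1]]   # party-only model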
### Figure 3.6 #######################################
predicted <- fitted(repl.logit.fit)
actual <- as.numeric(repl.logit.fit$model$voteno)
pred.simp <- fitted(simple.logit.fit)
brier(actual, predicted)
brier(actual, pred.simp)

#pdf(file="coburnseparation.pdf")
par(mfrow=c(2,1))
separationplot(pred.simp, actual, heading = "Party-only Model",
               height = .75, width=3, lwd1 = 0.9, newplot=F)
separationplot(predicted, actual, heading = "Full Model",
               height = .75, width=3, lwd1 = 1, lwd2=1, newplot=F)
par(mfrow=c(1,1))
#dev.off()
######################################################

### Table 3.10 #######################################
wald.test(b = coef(repl.logit.fit), Sigma = vcov(repl.logit.fit), Terms=c(1:10))

or <- exp(cbind(OR = coef(repl.logit.fit), confint(repl.logit.fit)))
stargazer(or, label=c("tab:repl2"), digits=2,
          title = "Odds-Ratios for Table 1 in Uscinski and Klofstad (2010)",
          covariate.labels=c("Gender (Female)","Political Science Major in College",
                             "Number of Top 20 Political Science Programs",
                             "Number of Top 50 Political Science Programs",
                             "Total Number of Political Science Programs",
                             "Percentage with Advanced Degrees",
                             "Number of Amendment Petitioners",
                             "Number of NSF Grants 2008",
                             "Party Identification (Democrat)",
                             "Years before Next Election",
                             "Member of Labor HHS Subcommittee",
                             "Seniority"))
######################################################

### Figure 3.7 #######################################
simdata.dem <- with(repl.df, data.frame(
  female=min(female),
  poliscimajor=max(poliscimajor),
  top20phd=mean(top20phd),
  top50phd=mean(top50phd),
  phdprograms=mean(phdprograms),
  advancedpercent=mean(advancedpercent),
  petitioners=mean(petitioners),
  nsfgrants=min(nsfgrants),
  democrat=1,
  yearstoelection=c(1,3,5),
  #yearstoelection=seq(min(yearstoelection),max(yearstoelection), length=100),
  hhsera=max(hhsera),
  seniority=min(hhsera)
))

simdata.rep <- with(repl.df, data.frame(
  female=min(female),
  poliscimajor=max(poliscimajor),
  top20phd=mean(top20phd),
  top50phd=mean(top50phd),
  phdprograms=mean(phdprograms),
  advancedpercent=mean(advancedpercent),
  petitioners=mean(petitioners),
  nsfgrants=min(nsfgrants),
  democrat=0,
  yearstoelection=c(1,3,5),
  #yearstoelection=seq(min(yearstoelection),max(yearstoelection), length=100),
  hhsera=max(hhsera),
  seniority=min(hhsera)
))

newdat.dem <- cbind(simdata.dem,
                    predict(repl.logit.fit, newdata = simdata.dem, type="link", se=TRUE))
newdata.dem <- within(newdat.dem, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (1.96 * se.fit))
  UL <- plogis(fit + (1.96 * se.fit))
})

newdat.rep <- cbind(simdata.rep,
                    predict(repl.logit.fit, newdata = simdata.rep, type="link", se=TRUE))
newdata.rep <- within(newdat.rep, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (1.96 * se.fit))
  UL <- plogis(fit + (1.96 * se.fit))
})

#pdf("coburninterp.pdf")
plot(simdata.dem$yearstoelection, newdata.dem$PredictedProb, ylim=c(0,1.1),
     xlab = "Years to Election", ylab = "Predicted Probability of Nay Vote", las=1,
     main = "Voting on the Coburn Amendment", bty="n", yaxp = c(0, 1, 5),
     col="white", xlim=c(1,5.1), xaxp=c(1,5,2))
lines(simdata.dem$yearstoelection, newdata.dem$PredictedProb,
      type="b", col=grey(0.45), lty=2)
lines(simdata.dem$yearstoelection+0.15, newdata.rep$PredictedProb,
      pch=16, type="b", lty=2)
arrows(x0=simdata.dem$yearstoelection, x1=simdata.dem$yearstoelection,
       y0=newdata.dem$LL, y1=newdata.dem$UL,
       col=grey(0.45), angle=90, code=3, length=.05, lwd=1.2)
arrows(x0=simdata.rep$yearstoelection+.15, x1=simdata.rep$yearstoelection+.15,
       y0=newdata.rep$LL, y1=newdata.rep$UL,
       angle=90, code=3, length=.05, lwd=1.2)
legend(3.5, 0.3, legend = c("Democrats", "Other"), pch = c(1,16),
       col=c(grey(0.45),"black"))
#dev.off()
######################################################

### Figure 3.8 #######################################
#pdf(file = "binlinks.pdf")
x <- seq(-5, 5, by=.1)
plot(x, plogis(x), type="l", bty="n", lwd=2,
     xlab = expression(x[i]^T*beta), las=1,
     ylab = expression(theta), main="", col=gray(0.5), lty=2)
lines(x, pnorm(x), lwd=2, col=gray(0.5))
lines(x, cloglog(x, inverse=T), lwd=3)
legend(2, .4, legend=c("cloglog", "probit", "logit"),
       lty=c(1,1,2), lwd=c(3,2,2), col=c(1, gray(0.5), gray(0.5)))
#dev.off()
######################################################

### Table 3.11 #######################################
# Cook and Savun example
library(haven)
mydata <- read_dta("MilitaryRegimes_CS_JPR.dta")
mydata <- slide(mydata, Var="ln_gdp", GroupVar = "ccode", NewVar = "gdpLag",
                TimeVar = "year", slideBy = -1)
mydata <- slide(mydata, Var="ln_pop", GroupVar = "ccode", NewVar = "popLag",
                TimeVar = "year", slideBy = -1)
mydata.sel <- subset(mydata, gwf_demo_plus==1)
names(mydata.sel)[62] <- c("spline1")
names(mydata.sel)[63] <- c("spline2")
names(mydata.sel)[64] <- c("spline3")

# prune variables to those needed
small.df <- mydata.sel[, c(1:5, 11, 32, 33, 36, 61:64, 97, 98)]
# clean up names
names(small.df)[6] <- c("onset")
names(small.df)[7] <- c("military")
names(small.df)[8] <- c("party")
names(small.df)[9] <- c("personal")

# Code to generate clustered standard errors,
# adapted from cluster.gs.glm in the ClusterSEs package
clust.se <- function(mod, dat, cluster){
  form <- mod$formula
  variables <- all.vars(form)
  clust.name <- all.vars(cluster)
  used.idx <- which(rownames(dat) %in% rownames(mod$model))
  dat <- dat[used.idx, ]
  clust <- as.vector(unlist(dat[[clust.name]]))
  G <- length(unique(clust))
  ind.variables <- names(coefficients(mod))
  cl <- function(dat, fm, cluster){
    M <- length(unique(cluster))
    N <- length(cluster)
    K <- fm$rank
    dfc <- (M/(M - 1))
    uj <- apply(estfun(fm), 2, function(x) tapply(x, cluster, sum))
    vcovCL <- dfc * sandwich(fm, meat. = crossprod(uj)/N)
    out.tab <- coeftest(fm, vcovCL)
    return(list(vcovCL, out.tab))
  }
  out <- cl(dat, mod, clust)
  out[[3]] <- out[[2]][ind.variables, 2]
  names(out) <- c("vcov", "coeftest", "se")
  return(out)
}

model.2.logit <- glm(onset ~ military + personal + party + gdpLag + popLag +
                       peaceyrs + spline1 + spline2 + spline3,
                     data=small.df, family=binomial(link = "logit"))
cses.logit <- clust.se(model.2.logit, cluster = ~ccode, dat=small.df)
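### Cross-check (not in the original script): recent versions of the sandwich
### package (>= 2.4-0) can compute a cluster-robust covariance matrix directly via
### vcovCL(); the result should be close to clust.se() above, up to the
### finite-sample adjustment used.
coeftest(model.2.logit, vcov = vcovCL(model.2.logit, cluster = ~ccode))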
model.2.probit <- glm(onset ~ military + personal + party + gdpLag + popLag +
                        peaceyrs + spline1 + spline2 + spline3,
                      data=small.df, family=binomial(link = "probit"))
cses.probit <- clust.se(model.2.probit, cluster = ~ccode, dat=small.df)

model.2.hprobit <- hetglm(onset ~ military + personal + party + gdpLag + popLag +
                            peaceyrs + spline1 + spline2 + spline3 | popLag,
                          data=small.df, family=binomial(link = "probit"))
cses.hprobit <- clust.se(model.2.hprobit, cluster = ~ccode, dat=small.df)

model.2.cloglog <- glm(onset ~ military + personal + party + gdpLag + popLag +
                         peaceyrs + spline1 + spline2 + spline3,
                       data=small.df, family=binomial(link = "cloglog"))
cses.cloglog <- clust.se(model.2.cloglog, cluster = ~ccode, dat=small.df)

# log-log link: fit a cloglog model to the reversed outcome
model.2.loglog <- glm(I((onset-1)^2) ~ military + personal + party + gdpLag + popLag +
                        peaceyrs + spline1 + spline2 + spline3,
                      data=small.df, family=binomial(link = "cloglog"))
cses.loglog <- clust.se(model.2.loglog, cluster = ~ccode, dat=small.df)

### Table 3.11
stargazer(model.2.logit, model.2.probit, model.2.hprobit,
          model.2.cloglog, model.2.loglog,
          se = list(cses.logit$se, cses.probit$se, cses.hprobit$se,
                    cses.cloglog$se, cses.loglog$se),
          title="Replication and extension of Table II in Cook and Savun (2016)",
          covariate.labels = c("Military", "Personal", "Party", "GDP (lagged)",
                               "Population (lagged)", "Peace years",
                               "spline1", "spline2", "spline3"),
          omit = "spline", star.cutoffs=NA, digits=2,
          dep.var.labels = "Civil Conflict Onset",
          column.labels = c("logit", "probit", "heteroskedastic probit",
                            "clog-log", "log-log"),
          label="table:CSrepl",
          notes="Standard errors are clustered by country, following Cook and Savun.")

### Figure 3.9
hat.l <- fitted(model.2.logit)
hat.p <- fitted(model.2.probit)
hat.hp <- fitted(model.2.hprobit)
hat.cll <- fitted(model.2.cloglog)
hat.ll <- fitted(model.2.loglog)
actual <- model.2.logit$y
actual.ll <- (actual-1)^2

pred.l <- prediction(hat.l, actual)
perf.l <- performance(pred.l, "tpr", "fpr")
pred.p <- prediction(hat.p, actual)
perf.p <- performance(pred.p, "tpr", "fpr")
pred.hp <- prediction(hat.hp, actual)
perf.hp <- performance(pred.hp, "tpr", "fpr")
pred.cll <- prediction(hat.cll, actual)
perf.cll <- performance(pred.cll, "tpr", "fpr")
pred.ll <- prediction(hat.ll, actual.ll)
perf.ll <- performance(pred.ll, "tpr", "fpr")

# Figure 3.9
#pdf(file = "CSROC.pdf", width=5, height=5, onefile=TRUE)
par(las=1, bty="n")
plot(perf.hp, main="ROC plots for competing models", bty="n", lwd=2)
plot(perf.l, lwd=2, col="slateblue2", add=T)
plot(perf.p, lwd=2, col=grey(0.85), add=T)
plot(perf.cll, lwd=2, col=grey(0.5), add=T)
plot(perf.ll, lwd=2, col=grey(0.5), add=T)
lines(actual, actual, lty=3)
######################################################
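### Not in the original script: AUC for each of the five Cook and Savun
### specifications, using ROCR's "auc" measure on the prediction objects built above.
sapply(list(logit=pred.l, probit=pred.p, het.probit=pred.hp,
            cloglog=pred.cll, loglog=pred.ll),
       function(p) performance(p, "auc")@y.values[[1]])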