In other words, the goals are to:
# The first line sets an option for the final document that can be produced from
# the .Rmd file. Don't worry about it.
knitr::opts_chunk$set(echo = TRUE,
                      collapse = TRUE,
                      out.width="\\textwidth", # for larger figures 
                      attr.output = 'style="max-height: 200px"',
                      tidy = 'styler' # styles the code in the output 
                      )
# The next bit is quite powerful and useful. 
# First you define which packages you need for your analysis and assign it to 
# the p_needed object. 
p_needed <-
  c("ggplot2", "viridis", "MASS", "optimx", "scales", "foreign", 
    "separationplot", "patchwork", "stargazer", "ggplotify")
# Now you check which packages are already installed on your computer.
# The function installed.packages() returns a vector with all the installed 
# packages.
packages <- rownames(installed.packages())
# Then you check which of the packages you need are not installed on your 
# computer yet. Essentially you compare the vector p_needed with the vector
# packages. The result of this comparison is assigned to p_to_install.
p_to_install <- p_needed[!(p_needed %in% packages)]
# If at least one element is in p_to_install you then install those missing
# packages.
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}
# Now that all packages are installed on the computer, you can load them for
# this project. Additionally the expression returns whether the packages were
# successfully loaded.
sapply(p_needed, require, character.only = TRUE)
# This is an option for stargazer tables
# It automatically adapts the output to html or latex,
# depending on whether we want a html or pdf file
stargazer_opt <- ifelse(knitr::is_latex_output(), "latex", "html")
# only relevant for ggplot2 plotting
# setting a global ggplot theme for the entire document to avoid 
# setting this individually for each plot 
theme_set(theme_classic() + # start with classic theme 
  theme(
    plot.background = element_blank(),# remove all background 
    plot.title.position = "plot", # move the plot title start slightly 
    legend.position = "bottom" # by default, put legend on the bottom
  ))As usual, we start with some fake data. But this time it will be slightly different.
The first part will look familiar.
# N of Population
n <- 1000
# Set our true Parameters
beta0 <- 0.25
beta1 <- -0.8
# Randomly draw an Independent Variable
set.seed(2021)
X <- rnorm(n, 0, 1)Now we need some additional lines.
We generate p (aka \(\pi\) on slide 10) via the logistic response function:
\[ \pi_i = \frac{\exp(X_i\beta)}{1+\exp(X_i\beta)} = \dfrac{\overbrace{\exp(\mu_i)}^{\text{always greater than 0}}}{\underbrace{1 + \exp(\mu_i)}_{\text{1 + "always greater than 0"}}} \Rightarrow 0 < \pi_i < 1 \]
Here mu is the linear predictor. Together, the linear predictor and the response function constitute the systematic component of the model. By using the logistic response function, we are ensuring it that with any value of \(X\) or \(\beta\), the systematic component is reparametarized to be between 0 and 1.
mu <- beta0 + beta1 * X
p <- exp(mu) / (1 + exp(mu))
# we achieve the same with the following code:
p <- plogis(mu)Let’s quickly plot what we just did:
par(mfrow = c(1, 2))
# Systematic component: linear predictor
plot(
  x = X,
  y = mu,
  pch = 19,
  cex = 0.5,
  col = viridis(1, 0.5),
  main = "Simulated linear predictor",
  font.main = 1,
  cex.main = 0.8,
  ylab = expression(mu[i]),
  xlab = expression(X[i]),
  las = 1,
  bty = "n"
)
text(
  x = 1.5,
  y = 1.8,
  labels = expression(mu[i] == X[i] * beta)
)
# Systematic component: predicted probabilities
plot(
  x = X,
  y = p,
  pch = 19,
  cex = 0.5,
  col = viridis(1, 0.5),
  ylim = c(0, 1),
  main = "Simulated predicted probabilities",
  font.main = 1,
  cex.main = 0.8,
  ylab = expression(p[i]),
  xlab = expression(X[i]),
  las = 1,
  bty = "n"
)
text(
  x = 1.5,
  y = 0.85,
  labels = expression(pi[i] == frac(exp(mu[i]), exp(1 + mu[i])))
)As we observe only 0 or 1, we draw from a Bernoulli distribution (the stochastic component of the model) with p = p (\(\pi\)). Recall that in R, we take draws from Bernoulli by working with its more general form, the Binomial distribution, but specify the size = 1, i.e. that we only take one draw from this distribution:
Y <- rbinom(n, 1, p)Let’s add this step to the plot above:
par(mfrow = c(1, 3))
# Systematic component: LP (unobserved in reality)
plot(
  x = X,
  y = mu,
  pch = 19,
  cex = 0.5,
  col = ifelse(Y == 1, viridis(2, 0.5)[1], viridis(2, 0.5)[2]),
  main = "Simulated Linear Predictor",
  font.main = 1,
  cex.main = 0.8,
  ylab = expression(mu[i]),
  xlab = expression(X[i]),
  las = 1,
  bty = "n"
)
text(
  x = 1.5,
  y = 1.8,
  labels = expression(mu[i] == X[i] * beta)
)
# Systematic component: predicted probabilities (unobserved in reality)
plot(
  x = X,
  y = p,
  pch = 19,
  cex = 0.5,
  ylim = c(0, 1),
  col = ifelse(Y == 1, viridis(2, 0.5)[1], viridis(2, 0.5)[2]),
  main = "Simulated predicted probabilities",
  font.main = 1,
  cex.main = 0.8,
  ylab = expression(p[i]),
  xlab = expression(X[i]),
  las = 1,
  bty = "n"
)
text(
  x = 1.5,
  y = 0.85,
  labels = expression(pi[i] == frac(exp(mu[i]), exp(1 + mu[i])))
)
# observed values
plot(
  x = X,
  y = Y,
  pch = 19,
  cex = 0.5,
  col = ifelse(Y == 1, viridis(2, 0.5)[1], viridis(2, 0.5)[2]),
  main = "Simulated observed values Y",
  font.main = 1,
  cex.main = 0.8,
  ylab = expression(Y[i]),
  xlab = expression(X[i]),
  las = 1,
  bty = "n"
)Now we want to estimate a logit model, i.e., to get back from the observed probabilities \(Y\) and the values of \(X\) to the coefficients \(\beta\) and probabilities \(\pi\).
# Systematic component: LP
syst <- ggplot() +
  scale_y_continuous(limits = c(-3, 3)) +
  scale_x_continuous(limits = c(-3.5, 3.5)) +
  geom_point(aes(
    x = X, y = mu,
    color = Y
  )) +
  labs(
    x = expression(X[i]),
    y = expression(mu[i]),
    title = "Linear Predictor"
  ) +
  annotate("text",
    label = expression(mu[i] == X[i] * beta),
    x = 1, y = 1.8,
    hjust = 0
  )
# Systematic component: predicted probabilities
pps <- ggplot() +
  scale_y_continuous(limits = c(0, 1)) +
  scale_x_continuous(limits = c(-3.5, 3.5)) +
  geom_point(aes(
    x = X, y = p,
    color = Y
  )) +
  labs(
    x = expression(X[i]),
    y = expression(p[i]),
    title = "Predicted Probabilities"
  ) +
  annotate("text",
    label = expression(pi[i] == frac(exp(mu[i]), exp(1 + mu[i]))),
    x = 1, y = 0.85
  )
# observed values
yhat <- ggplot() +
  scale_y_continuous(limits = c(0, 1)) +
  scale_x_continuous(limits = c(-3.5, 3.5)) +
  geom_point(aes(
    x = X, y = Y,
    color = Y
  )) +
  labs(
    x = expression(X[i]),
    y = expression(Y[i]),
    title = "Observed Probabilities"
  )
syst + pps + yhat +
  plot_annotation(title = "Simulated Quantities") &
  scale_color_viridis(direction = -1) & theme(legend.position = "none")Now we want to estimate a logit model, i.e., to get back from the observed probabilities \(Y\) and the values of \(X\) to the coefficients \(\beta\) and probabilities \(\pi\).
We start by writing our own logistic response function in R: \[\pi = \dfrac{\overbrace{\exp(x)}^{\text{always greater than 0}}}{\underbrace{1 + \exp(x)}_{\text{1 + "always greater than 0"}}} \Rightarrow 0 < \pi < 1\].
l_response <- function(x) {
  exp(x) / (1 + exp(x))
}Check if your function works:
head(l_response(beta0 + beta1 * X))As we want to estimate \(\beta_0\) and \(\beta_1\) using MLE, we need to set up a (log-)likelihood function.
For this, we translate the Log-likelihood of the Logit model from slide 14 to R.
\[ \ell(\beta|y, X) = \sum_{i=1}^{n} [y_i \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)} + (1-y_i) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)})] \]
This looks messier than it is.
What happens to the log-likelihood function when \(y_i = 0\)?
\[ y_i \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)} + (1-y_i) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}) \\= \underbrace{0 \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}}_{\text{cancels out}} + \underbrace{(1-0) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)})}_{\text{stays}} \\= \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}) \]
The opposite is true when \(y_i = 1\):
\[ y_i \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)} + (1-y_i) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}) \\= \underbrace{1 \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}}_{\text{stays}} + \underbrace{(1-1) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)})}_{\text{cancels out}} \\= \log(\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}) \] Depending on the value we observe, \(y_i = 0\) or \(y_i = 1\), we receive these reduced parts of the log-likelihood function for each observation and sum across them.
Now we can translate this into R.
Remember that we already have the response function \(\dfrac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}\) implemented as l_response().
logit_ll <- function(X, Y, theta) {
  beta0 <- theta[1]
  beta1 <- theta[2]
  mu <- theta[1] + theta[2] * X
  logl <- sum(Y * log(l_response(mu)) + (1 - Y) * log(1 - l_response(mu)))
  return(logl)
}Thanks to this reparameterization of the linear predictor mu with the l_response, we can estimate the theta as unbounded, yet still ensuring that the predicted probabilities will be within the range of 0 and 1.
Now we maximize it numerically with the help of optimx.
# Set Starting Values
stval <- c(0, 0) # 2 values of theta -> 2 starting values
# Optimize
res <- optimx(
  par = stval, # start values
  fn = logit_ll, # log-likelihood function
  Y = Y,
  X = X,
  control = list(maximize = T)
)
## Maximizing -- use negfn and neggr
res
##                    p1         p2    value fevals gevals niter convcode kkt1
## Nelder-Mead 0.2442764 -0.8836574 -608.544     59     NA    NA        0 TRUE
## BFGS        0.2441709 -0.8837175 -608.544     44     10    NA        0 TRUE
##             kkt2 xtime
## Nelder-Mead TRUE  0.03
## BFGS        TRUE  0.03Luckily, this is all implemented in R already.
Have a look at the documentation of glm.
?glmLet’s make use of it:
m0 <- glm(Y ~ X,
  family = binomial(link = logit)
)
summary(m0)## 
## Call:
## glm(formula = Y ~ X, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4003  -1.0666   0.6082   0.9917   1.9966  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.24415    0.06918   3.529 0.000417 ***
## X           -0.88379    0.07805 -11.323  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1376.3  on 999  degrees of freedom
## Residual deviance: 1217.1  on 998  degrees of freedom
## AIC: 1221.1
## 
## Number of Fisher Scoring iterations: 4
With glm we can also easily implement a probit model:
m0_probit <- glm(Y ~ X,
  family = binomial(link = probit)
)
m0_probit$coefficients
## (Intercept)           X 
##   0.1463777  -0.5333905Why do the coefficients differ for logit and probit models? (Slide 15 might help.)
The predicted probability from the logit model would be:
\[\hat\pi_i = \text{Pr}(y_i = 1) = \dfrac{\exp(X_i\hat\beta)}{1 + \exp(X_i\hat\beta)} = \dfrac{\exp(0.24 -0.88\cdot x_i)}{1 + \exp(0.24 -0.88\cdot x_i)}\]
head(l_response(coef(m0)[1] + coef(m0)[2] * X))
## [1] 0.5871943 0.4392744 0.4840094 0.4815857 0.3659704 0.8747143
# head(plogis(coef(m0)[1] + coef(m0)[2] * X)\[1 - \hat\pi_i = \text{Pr}(y_i = 0) = 1 - \dfrac{\exp(X_i\hat\beta)}{1 + \exp(X_i\hat\beta)} = 1 - \dfrac{\exp(0.24 -0.88\cdot x_i)}{1 + \exp(0.24 -0.88\cdot x_i)}\]
head(1 - l_response(coef(m0)[1] + coef(m0)[2] * X))
## [1] 0.4128057 0.5607256 0.5159906 0.5184143 0.6340296 0.1252857
# head(1 - plogis(coef(m0)[1] + coef(m0)[2] * X)These values together will sum to 1:
head(l_response(coef(m0)[1] + coef(m0)[2] * X) + (1 - l_response(coef(m0)[1] + coef(m0)[2] * X)))
## [1] 1 1 1 1 1 1
# head(plogis(coef(m0)[1] + coef(m0)[2] * X) +
#        (1 - plogis(coef(m0)[1] + coef(m0)[2] * X)))The predicted probability from the probit model would be:
\[\hat\pi_i = \text{Pr}(y_i = 1) = \Phi(X_i\hat\beta) = \Phi(0.15 - 0.53 \cdot x_i),\] where \(\Phi(\cdot)\) is the CDF of a standard normal distribution \(\mathcal{N}(0,1)\).
head(pnorm(coef(m0_probit)[1] + coef(m0_probit)[2] * X))
## [1] 0.5838282 0.4410540 0.4842106 0.4818756 0.3697047 0.8793730\[1 - \hat\pi_i = \text{Pr}(y_i = 0) = 1 - \Phi(X_i\hat\beta) = 1 - \Phi(0.15 - 0.53 \cdot x_i)\]
head(1 - pnorm(coef(m0_probit)[1] + coef(m0_probit)[2] * X))
## [1] 0.4161718 0.5589460 0.5157894 0.5181244 0.6302953 0.1206270These values together will sum to 1:
head(pnorm(coef(m0_probit)[1] + coef(m0_probit)[2] * X) +
  (1 - pnorm(coef(m0_probit)[1] + coef(m0_probit)[2] * X)))
## [1] 1 1 1 1 1 1The coefficients of logit and (even more so) probit models are really hard to interpret beyond their sign. Now our simulation approach, to get meaningful quantities of interest, becomes really helpful.
Let’s start with plotting expected probabilities of our fake data model.
Remember the steps?
Steps for simulating parameters (Estimation Uncertainty):
beta_hat).V_hat).N(beta_hat, V_hat).nsim times.# 1. get the coefficients
beta_hat <- coef(m0)
# 2. Get the variance-covariance matrix
V_hat <- vcov(m0)
# 3. Set up a multivariate normal distribution N(beta_hat, V_hat)
# 4. Draw from the distribution nsim times
nsim <- 10000
S <- mvrnorm(
  n = nsim,
  mu = beta_hat,
  Sigma = V_hat
)Next step: set up an interesting scenarios and calculate expected values.
# we simulate over a sequence of x-values
seq_X <- seq(min(X), max(X), length.out = 100)
scenario <- cbind(1, seq_X)
# linear predictor
Xbeta <- S %*% t(scenario)
dim(Xbeta)
## [1] 10000   100So far there is nothing new compared to OLS simulation!
Now comes the difference:
To get expected values for \(p\) (aka predicted probabilities), we need to plug in the \(X_i\beta\) (Xbeta) values into the response function to get simulated probabilities.
p_sim <- (exp(Xbeta)) / (1 + exp(Xbeta))
# we can also use our own function:
p_sim <- l_response(Xbeta)
# or the build-in R-function for logistic distribution function (CDF):
p_sim <- plogis(Xbeta) # aka inverse logit
dim(p_sim)
## [1] 10000   100As before, we also want means and quantiles of our quantities of interest.
p_mean <- apply(p_sim, 2, mean)
p_qu <- t(apply(p_sim, 2, quantile, prob = c(0.025, 0.975)))plot(
  x = seq_X,
  y = p_mean,
  ylim = c(0, 1),
  xlim = range(pretty(seq_X)),
  type = "n",
  main = "Predicted Probabilies of Y",
  ylab = "Probability of Y",
  xlab = "X",
  bty = "n",
  las = 1
)
# plot uncertainty with a polygon
polygon(
  x = c(rev(seq_X), seq_X),
  y = c(rev(p_qu[, 2]), p_qu[, 1]),
  col = viridis(1, 0.4),
  border = NA
)
# and a line
lines(
  x = seq_X,
  y = p_mean,
  lwd = 2
)Remember that we know the true data generating process (because we simulated the data). This means that we also know the true probability of Y for each observation. Let’s see how well our estimation (and simulation) worked:
plot(
  x = seq_X,
  y = p_mean,
  ylim = c(0, 1),
  xlim = range(pretty(seq_X)),
  type = "n",
  main = "Predicted Probabilies of Y",
  ylab = "Probability of Y",
  xlab = "X",
  bty = "n",
  las = 1
)
# plot uncertainty with a polygon
polygon(
  x = c(rev(seq_X), seq_X),
  y = c(rev(p_qu[, 2]), p_qu[, 1]),
  col = viridis(2, 0.4)[1],
  border = NA
)
# and a line
lines(
  x = seq_X,
  y = p_mean,
  lwd = 2
)
# we add the "true" probabilities
# note that these are unobserved in the real world
points(
  x = X,
  y = p,
  pch = 1,
  cex = 0.5,
  ylim = c(0, 1),
  col = viridis(2, 0.4)[2]
)# data frame for predicted probabilities and CI
plot_df <- data.frame(
  "p_mean" = p_mean,
  "ci_lo" = p_qu[, 1],
  "ci_hi" = p_qu[, 2]
)
plot_df$seq_X <- seq_X
# plot
pp <- ggplot(data = plot_df, aes(x = seq_X, y = p_mean)) +
  geom_line() +
  geom_ribbon(
    aes(ymin = ci_lo, ymax = ci_hi),
    fill = viridis(1, 0.5), # add color for filling
    color = viridis(1), # add color for lines
    linetype = "dashed" # make lines dashed
  ) +
  labs(
    x = "X",
    y = "Probability of Y",
    title = "Predicted Probabilities of Y"
  )
ppRemember that we know the true data generating process (because we simulated the data). This means that we also know the true probability of Y for each observation. Let’s see how well our estimation (and simulation) worked:
fake_data <- data.frame(X, p)
pp + geom_point(
  data = fake_data,
  aes(x = X, y = p),
  shape = 1,
  color = viridis(2, 0.4)[2]
)How can we check how good our logit model actually is?
An easy test is to cross-tabulate observed and predicted values of y. For this we classify the predicted probabilities as either 0 or 1. This of course depends on some cut-point. Usually we take a cutoff point of 0.5.
Let’s do this for our model: m0$fitted.values > 0.5 will generate the logical values for whether the fitted value is above or below the threshold TRUE and FALSE, and we will multiply them by *1 to make TRUE equal 1 and FALSE equal zero.
table(
  observed = Y,
  predicted = (m0$fitted.values > 0.5) * 1
) # or simply round(m0$fitted.values)
##         predicted
## observed   0   1
##        0 255 195
##        1 138 412How many correctly predicted? How many falsely predicted?
Let’s make a classification table and calculate the percentage of correctly predicted (aka accuracy) cases (PCP; see slide 25).
class_table <- table(observed = Y, predicted = (m0$fitted.values > 0.5) * 1)
class_table
##         predicted
## observed   0   1
##        0 255 195
##        1 138 412
pcp <- sum(diag(class_table)) / sum(class_table)
pcp
## [1] 0.667Another very good way to asses model fit are separation plots.
Let’s install the package separationlpot to get started (already done).
Separation plot is a method to visually inspect the fit of a binary model. In this plot, the dark and light panels correspond to the actual instances of the events \((y_i = 1)\) and nonevents \((y_i = 0)\), respectively. Construction of the separation plot begins by simply rearranging the observations such that the fitted values, i.e. the predicted probabilities \(\pi_i\) are presented in ascending order (you can spot it on the black line on the plot). Then one notes whether each of these observations corresponds to an actual instance of the event \((y_i = 1)\) or a nonevent \((y_i = 0)\). These are depicted with the color panels mentioned above. The more nonevents are on the left side and the more events are on the right side, the better the model fit.
Greenhill, Ward, and Sacks (2011) explain this in more detail:
A model with no predictive power-i.e., one whose outcomes can be approximated by a random coin toss—would generate an even distribution of 0s and 1s along the column on the right-hand side. On the other hand, a model with perfect predictive power would produce a complete separation of the 0s and 1s in the right-hand column: low fitted values would always turn out to be associated with actual instances of peace (0s), whereas high fitted values would always be associated with actual instances of war (1s).
A perfect model would show complete separation of the dark and light‐colored rows like this:
The dark line, the respective \(\pi_i\), allows us to judge whether the overall degree of separation between events and nonevents is associated with sharp differences in predicted probabilities, or more modest differences.
Another quantity of interest is the expected number of total events predicted by the model. We can calculate this by simply adding up the predicted probabilities across all observations and rounding the value. This quantity, depicted with a small black triangle on the plot, allows to see how the total number of events predicted by the model compares to the actual number of events in the data.
This is what the separation plot for our fake data would look like:
separationplot(
  pred = m0$fitted,
  actual = Y,
  line = TRUE,
  heading = "Separation Plot for Model 0",
  show.expected = T,
  col0 = viridis(2, alpha = 0.5)[2],
  col1 = viridis(2, alpha = 0.5)[1],
  lwd2 = 2,
  height = 2,
  newplot = F
)If you want to know more about Separation Plots and see more examples of using them, you should read the paper by Greenhill, Ward, and Sacks (2011).
In case you want to keep all plots as ggplot2 objects for some reason (say to keep the figure fonts uniform across the paper), you can transform these base R objects into ggplot2 kind with ggplotify package:
as.ggplot(~ separationplot(
  pred = m0$fitted,
  actual = Y,
  line = TRUE,
  show.expected = T,
  col0 = viridis(2)[2],
  col1 = viridis(2)[1],
  lwd2 = 2,
  newplot = F
)) +
  labs(title = "Separation Plot for Model 0")Let’s have a look at some real data!
We will work with the Fearon and Laitin (2003) dataset on ethnicity, insurgency, and civil wars.
First, we estimate the effect of log mountains (log_mountain) on onset of civil war (civilwar) using optimx and the functions from above.
We start with a simple bivariate model.
df <- read.dta("raw-data/fearon.dta")
stargazer(df, type = stargazer_opt)| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max | 
| civilwar | 6,191 | 0.017 | 0.129 | 0 | 0 | 0 | 1 | 
| ethnicwar | 6,191 | 0.011 | 0.106 | 0 | 0 | 0 | 1 | 
| priorwar | 6,191 | 0.133 | 0.340 | 0 | 0 | 0 | 1 | 
| gdp_lagged | 6,191 | 3.609 | 4.327 | 0.048 | 0.930 | 4.469 | 53.901 | 
| log_population | 6,191 | 9.078 | 1.456 | 5.403 | 8.091 | 9.946 | 14.029 | 
| log_mountain | 6,191 | 2.177 | 1.409 | 0.000 | 0.993 | 3.378 | 4.557 | 
| noncontiguous | 6,191 | 0.178 | 0.383 | 0 | 0 | 0 | 1 | 
| oil | 6,191 | 0.128 | 0.334 | 0 | 0 | 0 | 1 | 
| newstate | 6,191 | 0.026 | 0.160 | 0 | 0 | 0 | 1 | 
| instability | 6,191 | 0.146 | 0.353 | 0 | 0 | 0 | 1 | 
| democracy1 | 6,191 | -0.479 | 7.555 | -10 | -7 | 8 | 10 | 
| ethnicfrac | 6,191 | 0.388 | 0.286 | 0.001 | 0.107 | 0.667 | 0.925 | 
| relifrac | 6,191 | 0.366 | 0.219 | 0.000 | 0.182 | 0.557 | 0.783 | 
| anocracy | 6,191 | 0.218 | 0.413 | 0 | 0 | 0 | 1 | 
| democracy2 | 6,191 | 0.335 | 0.472 | 0 | 0 | 1 | 1 | 
res2 <- optimx(stval,
  logit_ll,
  Y = df$civilwar,
  X = df$log_mountain,
  control = list(maximize = T)
)## Maximizing -- use negfn and neggr
res2##                    p1        p2     value fevals gevals niter convcode kkt1
## Nelder-Mead -4.740625 0.2764533 -521.1480     77     NA    NA        0 TRUE
## BFGS        -4.742026 0.2765875 -521.1479     40     16    NA        0 TRUE
##             kkt2 xtime
## Nelder-Mead TRUE  0.15
## BFGS        TRUE  0.15
Now we want to fit the same logit regression as above, but this time using glm:
m1 <- glm(civilwar ~ log_mountain,
  data = df,
  family = binomial(link = logit)
)Let’s see how we did and compare the results to our likelihood function.
m1$coefficients
##  (Intercept) log_mountain 
##   -4.7412093    0.2763476
res2[1:2, 1:2]
##                    p1        p2
## Nelder-Mead -4.740625 0.2764533
## BFGS        -4.742026 0.2765875We load the data and omit all observations with missing values.
df <- read.dta("raw-data/fearon_rep.dta")
summary(df)##     civilwar         ethnicwar         priorwar        gdp_lagged    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.048  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 0.930  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median : 1.995  
##  Mean   :0.01664   Mean   :0.0112   Mean   :0.1346   Mean   : 3.651  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.: 4.484  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :66.735  
##  NA's   :1         NA's   :1                         NA's   :237     
##  log_population    log_mountain    noncontiguous         oil        
##  Min.   : 5.403   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 8.076   1st Qu.:0.9933   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 9.004   Median :2.4248   Median :0.0000   Median :0.0000  
##  Mean   : 9.063   Mean   :2.1767   Mean   :0.1734   Mean   :0.1295  
##  3rd Qu.: 9.933   3rd Qu.:3.3393   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :14.029   Max.   :4.5570   Max.   :1.0000   Max.   :1.0000  
##  NA's   :177                                                        
##     newstate        instability       democracy1         ethnicfrac    
##  Min.   :0.00000   Min.   :0.0000   Min.   :-10.0000   Min.   :0.0010  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.: -7.0000   1st Qu.:0.1073  
##  Median :0.00000   Median :0.0000   Median : -3.0000   Median :0.3255  
##  Mean   :0.02965   Mean   :0.1465   Mean   : -0.4799   Mean   :0.3854  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:  8.0000   3rd Qu.:0.6637  
##  Max.   :1.00000   Max.   :1.0000   Max.   : 10.0000   Max.   :0.9250  
##                    NA's   :14       NA's   :69                         
##     relifrac         anocracy        democracy2    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1818   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.3576   Median :0.0000   Median :0.0000  
##  Mean   :0.3674   Mean   :0.2256   Mean   :0.3322  
##  3rd Qu.:0.5566   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :0.7828   Max.   :1.0000   Max.   :1.0000  
##                   NA's   :69       NA's   :69
df2 <- na.omit(df)
summary(df2)##     civilwar        ethnicwar          priorwar        gdp_lagged     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   : 0.0480  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.: 0.9295  
##  Median :0.0000   Median :0.00000   Median :0.0000   Median : 1.9980  
##  Mean   :0.0168   Mean   :0.01131   Mean   :0.1331   Mean   : 3.6085  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.: 4.4690  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :53.9010  
##  log_population    log_mountain    noncontiguous         oil        
##  Min.   : 5.403   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 8.091   1st Qu.:0.9933   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 9.018   Median :2.4248   Median :0.0000   Median :0.0000  
##  Mean   : 9.078   Mean   :2.1772   Mean   :0.1782   Mean   :0.1279  
##  3rd Qu.: 9.946   3rd Qu.:3.3776   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :14.029   Max.   :4.5570   Max.   :1.0000   Max.   :1.0000  
##     newstate        instability      democracy1         ethnicfrac    
##  Min.   :0.00000   Min.   :0.000   Min.   :-10.0000   Min.   :0.0010  
##  1st Qu.:0.00000   1st Qu.:0.000   1st Qu.: -7.0000   1st Qu.:0.1073  
##  Median :0.00000   Median :0.000   Median : -3.0000   Median :0.3331  
##  Mean   :0.02617   Mean   :0.146   Mean   : -0.4786   Mean   :0.3880  
##  3rd Qu.:0.00000   3rd Qu.:0.000   3rd Qu.:  8.0000   3rd Qu.:0.6666  
##  Max.   :1.00000   Max.   :1.000   Max.   : 10.0000   Max.   :0.9250  
##     relifrac         anocracy        democracy2   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.1818   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.3508   Median :0.0000   Median :0.000  
##  Mean   :0.3662   Mean   :0.2184   Mean   :0.335  
##  3rd Qu.:0.5566   3rd Qu.:0.0000   3rd Qu.:1.000  
##  Max.   :0.7828   Max.   :1.0000   Max.   :1.000
Let’s estimate the model:
m_fearon1 <- glm(
  civilwar ~
  priorwar +
    gdp_lagged +
    log_population +
    log_mountain +
    noncontiguous +
    oil +
    newstate +
    instability +
    democracy1 +
    ethnicfrac +
    relifrac,
  data = df,
  family = binomial(link = logit)
)In order to make sense of our results, let’s plot predicted probabilities for the onset of civil war given different levels of log_mountain, setting all other variables to their mean (continuous)/median (discrete).
Here is the scenario for you:
log_mountain_range <-
  seq(min(df$log_mountain, na.rm = T),
    max(df$log_mountain, na.rm = T),
    length.out = 100
  )
scenario <-
  cbind(
    1, # The Intercept
    median(df$priorwar, na.rm = T), # The median of priorwar
    mean(df$gdp_lagged, na.rm = T), # The mean of gdp_lagged
    mean(df$log_population, na.rm = T), # The mean of log_population
    log_mountain_range, # Our sequence for log_mountain
    median(df$noncontiguous, na.rm = T), # The median of noncontiguous
    median(df$oil, na.rm = T), # The median of oil
    median(df$newstate, na.rm = T), # The median of newstate
    median(df$instability, na.rm = T), # The median of instability
    mean(df$democracy1, na.rm = T), # The mean of democracy1
    mean(df$ethnicfrac, na.rm = T), # The mean of ethnicfrac
    mean(df$relifrac, na.rm = T) # The mean of relifrac
  )
# Have a look at the scenario
head(scenario)##                            log_mountain_range                            
## [1,] 1 0 3.651117 9.062762         0.00000000 0 0 0 0 -0.479896 0.3853547
## [2,] 1 0 3.651117 9.062762         0.04603060 0 0 0 0 -0.479896 0.3853547
## [3,] 1 0 3.651117 9.062762         0.09206121 0 0 0 0 -0.479896 0.3853547
## [4,] 1 0 3.651117 9.062762         0.13809181 0 0 0 0 -0.479896 0.3853547
## [5,] 1 0 3.651117 9.062762         0.18412241 0 0 0 0 -0.479896 0.3853547
## [6,] 1 0 3.651117 9.062762         0.23015302 0 0 0 0 -0.479896 0.3853547
##               
## [1,] 0.3673768
## [2,] 0.3673768
## [3,] 0.3673768
## [4,] 0.3673768
## [5,] 0.3673768
## [6,] 0.3673768
# Is everything in the correct order?
coef(m_fearon1)##    (Intercept)       priorwar     gdp_lagged log_population   log_mountain 
##    -6.57931110    -0.88696614    -0.35034944     0.24925147     0.21925279 
##  noncontiguous            oil       newstate    instability     democracy1 
##     0.34317774     0.83918510     1.70935127     0.64393145     0.02424965 
##     ethnicfrac       relifrac 
##     0.18804229     0.25036423
Now, it’s your turn.
First, could you describe the scenario in your own words?
Now, you can follow exactly the same steps as above:
beta_hat).V_hat).N(beta_hat, V_hat).nsim times.beta_hat <- 
V_hat <- 
library(MASS)
S <- Let’s use the scenario from above.
Xbeta <- To get expected values for p, we need to plug in the Xbeta values into the response function to get simulated probabilities.
p_sim <- Pro Question: How could we include fundamental uncertainty?
Now we also want means and quantiles:
p_mean <- 
p_qu <- This time we instantly plot a polygon and include ticks for actual x-values of our observations:
What can we learn from this plot?
Of course you could (and should!) also unlog the mountains…
# start with an empty plot
plot(
  x = exp(log_mountain_range),
  y = p_mean,
  type = "n",
  ylim = c(0, 0.045),
  ylab = "Probability of Civil War onset",
  xlab = "Mountainous Terrain in %",
  bty = "n",
  las = 1
)
# draw polygon
polygon(
  x = c(rev(exp(log_mountain_range)), exp(log_mountain_range)),
  y = c(rev(p_qu[, 2]), p_qu[, 1]),
  col = viridis(1, 0.4),
  border = NA
)
# add lines
lines(
  x = exp(log_mountain_range),
  y = p_mean, lwd = 2
)
lines(
  x = exp(log_mountain_range),
  y = p_qu[, 1],
  lty = "dashed",
  col = viridis(1)
)
lines(
  x = exp(log_mountain_range),
  y = p_qu[, 2],
  lty = "dashed",
  col = viridis(1)
)
# Adding ticks of actual x-values.
axis(1,
  at = exp(df$log_mountain),
  col.ticks = viridis(1),
  labels = FALSE,
  tck = 0.02
)Again, we want to assess model fit. Remember: simply running a model without testing for model fit is dangerous!
As above, we will look at classification tables and separation plots. Let’s do this for Model 1 from above.
# Expected Values (no fundamental uncertainty)
table(
  observed = factor(m_fearon1$model$civilwar, levels = 0:1),
  predicted = factor((m_fearon1$fitted.values > 0.5) * 1, levels = 0:1)
)
##         predicted
## observed    0    1
##        0 6087    0
##        1  104    0
# Predicted Values (including fundamental uncertainty)
table(
  observed = factor(m_fearon1$model$civilwar, levels = 0:1),
  predicted = factor(rbinom(
    n = length(m1$fitted.values),
    size = 1,
    p = m1$fitted.values
  ),
  levels = 0:1
  )
)
##         predicted
## observed    0    1
##        0 5983  104
##        1  104    0separationplot(m_fearon1$fitted, m_fearon1$model$civilwar,
  line = TRUE,
  heading = "Fearon/Laitin: Model 1",
  show.expected = T,
  height = 2,
  col0 = viridis(2)[2],
  col1 = viridis(2)[1],
  lwd2 = 2,
  newplot = F
)Again, we can also make classification tables and calculate the precentage of correctly predicted (aka accuracy) cases (PCP).
class_table1 <-
  table(
    observed = factor(m_fearon1$model$civilwar, levels = 0:1),
    predicted = factor((m_fearon1$fitted.values > 0.5) * 1, levels = 0:1)
  )
pcp_1 <- sum(diag(class_table1)) / sum(class_table1)
pcp_1
## [1] 0.9832014Sounds pretty good, doesn’t it?
But how good would a naive model - that is always predicting the majority class - predict civilwars?
That’s the so called Percent of observations in the Modal Category (PMC)
table(m_fearon1$model$civilwar)
## 
##    0    1 
## 6087  104
max(table(m_fearon1$model$civilwar)) / sum(table(m_fearon1$model$civilwar))
## [1] 0.9832014That’s exactly the same accuracy…
We can also make our life a little easier and write a function that will calculate the average case scenario for us:
# m_fearon1$model contains the data used to estimate the model
# there are no missing data there, and the variables are in the same order as
# the main terms in the regression equation (but this is not as straightforward
# with models with interactions or cases where we pass character or factor vars
# into the model)
# the first column is the dependent variable
head(m_fearon1$model)##   civilwar priorwar gdp_lagged log_population log_mountain noncontiguous oil
## 1        0        0      7.626       11.85630     3.214868             1   0
## 2        0        0      7.626       11.86313     3.214868             1   0
## 3        0        0      7.654       11.86859     3.214868             1   0
## 4        0        0      8.025       11.88673     3.214868             1   0
## 5        0        0      8.270       11.90488     3.214868             1   0
## 6        0        0      8.040       11.93343     3.214868             1   0
##   newstate instability democracy1 ethnicfrac relifrac
## 1        0           0         10  0.3569501    0.596
## 2        0           0         10  0.3569501    0.596
## 3        0           0         10  0.3569501    0.596
## 4        0           0         10  0.3569501    0.596
## 5        0           0         10  0.3569501    0.596
## 6        0           0         10  0.3569501    0.596
average_case <- function(x, rep = 1) {
  # if there only two unique values in x and  difference between them is 1
  # (i.e. it is a dummy variable 0/1, 1/2 or similar)
  if (!is.numeric(x)) {
    return(paste0("Transform variable to numeric"))
  }
  if (length(unique(x)) == 2 & diff(range(x)) == 1) {
    # we take a median
    rep(median(x, na.rm = T), rep)
  } else {
    # otherwise, we take a mean
    rep(mean(x, na.rm = T), rep)
  }
}
# calculate the average for all independent variables
scenario <- cbind(1, apply(m_fearon1$model[, -1], 2, average_case, rep = 100))
# substitute the values in one variable with a sequence
scenario[, which(colnames(scenario) == "log_mountain")] <-
  seq(min(df$log_mountain, na.rm = T),
    max(df$log_mountain, na.rm = T),
    length.out = 100
  )