# The first line sets an option for the final document that can be produced from
# the .Rmd file. Don't worry about it.
knitr::opts_chunk$set(echo = TRUE,
  collapse = TRUE,
  out.width = "\\textwidth", # for larger figures
  attr.output = 'style="max-height: 200px"',
  tidy = 'styler' # styles the code in the output
)
# The next bit is quite powerful and useful.
# First you define which packages you need for your analysis and assign it to
# the p_needed object.
p_needed <- c(
  "ggplot2", "viridis", "MASS", "optimx", "scales", "foreign",
  "separationplot", "patchwork", "stargazer", "ggplotify"
)

# Now you check which packages are already installed on your computer.
# The function installed.packages() returns a vector with all the installed
# packages.
packages <- rownames(installed.packages())

# Then you check which of the packages you need are not installed on your
# computer yet. Essentially you compare the vector p_needed with the vector
# packages. The result of this comparison is assigned to p_to_install.
p_to_install <- p_needed[!(p_needed %in% packages)]

# If at least one element is in p_to_install you then install those missing
# packages.
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}

# Now that all packages are installed on the computer, you can load them for
# this project. Additionally the expression returns whether the packages were
# successfully loaded.
sapply(p_needed, require, character.only = TRUE)
# This is an option for stargazer tables
# It automatically adapts the output to html or latex,
# depending on whether we want a html or pdf file
stargazer_opt <- ifelse(knitr::is_latex_output(), "latex", "html")

# only relevant for ggplot2 plotting
# setting a global ggplot theme for the entire document to avoid
# setting this individually for each plot
theme_set(theme_classic() + # start with classic theme
  theme(
    plot.background = element_blank(), # remove all background
    plot.title.position = "plot", # move the plot title start slightly
    legend.position = "bottom" # by default, put legend on the bottom
  ))
As usual, we start with some fake data. But this time it will be slightly different.
The first part will look familiar.
# N of Population
n <- 1000

# Set our true Parameters
beta0 <- 0.25
beta1 <- -0.8

# Randomly draw an Independent Variable
set.seed(2021)
X <- rnorm(n, 0, 1)
Now we need some additional lines. We generate p (aka \(\pi\) on slide 10) via the logistic response function:
\[ \pi_i = \frac{\exp(X_i\beta)}{1+\exp(X_i\beta)} = \dfrac{\overbrace{\exp(\mu_i)}^{\text{always greater than 0}}}{\underbrace{1 + \exp(\mu_i)}_{\text{1 + "always greater than 0"}}} \Rightarrow 0 < \pi_i < 1 \]
Here mu is the linear predictor. Together, the linear predictor and the response function constitute the systematic component of the model. By using the logistic response function, we ensure that for any value of \(X\) or \(\beta\), the systematic component is reparameterized to lie between 0 and 1.
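A quick sanity check on this property (a minimal sketch using R's built-in plogis(), the logistic CDF): even extreme values of the linear predictor are mapped strictly into the unit interval.

# even huge linear predictor values stay strictly between 0 and 1
plogis(c(-100, -5, 0, 5, 100))
## [1] 3.720076e-44 6.692851e-03 5.000000e-01 9.933071e-01 1.000000e+00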
mu <- beta0 + beta1 * X

p <- exp(mu) / (1 + exp(mu))

# we achieve the same with the following code:
p <- plogis(mu)
Let’s quickly plot what we just did:
par(mfrow = c(1, 2))

# Systematic component: linear predictor
plot(
  x = X,
  y = mu,
  pch = 19,
  cex = 0.5,
  col = viridis(1, 0.5),
  main = "Simulated linear predictor",
  font.main = 1,
  cex.main = 0.8,
  ylab = expression(mu[i]),
  xlab = expression(X[i]),
  las = 1,
  bty = "n"
)
text(
  x = 1.5,
  y = 1.8,
  labels = expression(mu[i] == X[i] * beta)
)
# Systematic component: predicted probabilities
plot(
  x = X,
  y = p,
  pch = 19,
  cex = 0.5,
  col = viridis(1, 0.5),
  ylim = c(0, 1),
  main = "Simulated predicted probabilities",
  font.main = 1,
  cex.main = 0.8,
  ylab = expression(p[i]),
  xlab = expression(X[i]),
  las = 1,
  bty = "n"
)
text(
  x = 1.5,
  y = 0.85,
  labels = expression(pi[i] == frac(exp(mu[i]), 1 + exp(mu[i])))
)
As we observe only 0 or 1, we draw from a Bernoulli distribution (the stochastic component of the model) with p = p (our vector of \(\pi_i\)). Recall that in R, we take draws from a Bernoulli by working with its more general form, the Binomial distribution, and specifying size = 1, i.e., a single trial per draw:
Y <- rbinom(n, 1, p)
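Note that rbinom() is vectorized over prob: each observation gets its own success probability, which is exactly what we exploit here. A tiny illustration (a sketch; the probabilities are arbitrary):

# one Bernoulli draw per observation, each with its own probability
rbinom(n = 5, size = 1, prob = c(0.99, 0.99, 0.01, 0.5, 0.5))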
Let’s add this step to the plot above:
par(mfrow = c(1, 3))

# Systematic component: LP (unobserved in reality)
plot(
  x = X,
  y = mu,
  pch = 19,
  cex = 0.5,
  col = ifelse(Y == 1, viridis(2, 0.5)[1], viridis(2, 0.5)[2]),
  main = "Simulated Linear Predictor",
  font.main = 1,
  cex.main = 0.8,
  ylab = expression(mu[i]),
  xlab = expression(X[i]),
  las = 1,
  bty = "n"
)
text(
  x = 1.5,
  y = 1.8,
  labels = expression(mu[i] == X[i] * beta)
)
# Systematic component: predicted probabilities (unobserved in reality)
plot(
  x = X,
  y = p,
  pch = 19,
  cex = 0.5,
  ylim = c(0, 1),
  col = ifelse(Y == 1, viridis(2, 0.5)[1], viridis(2, 0.5)[2]),
  main = "Simulated predicted probabilities",
  font.main = 1,
  cex.main = 0.8,
  ylab = expression(p[i]),
  xlab = expression(X[i]),
  las = 1,
  bty = "n"
)
text(
  x = 1.5,
  y = 0.85,
  labels = expression(pi[i] == frac(exp(mu[i]), 1 + exp(mu[i])))
)
# observed values
plot(
x = X,
y = Y,
pch = 19,
cex = 0.5,
col = ifelse(Y == 1, viridis(2, 0.5)[1], viridis(2, 0.5)[2]),
main = "Simulated observed values Y",
font.main = 1,
cex.main = 0.8,
ylab = expression(Y[i]),
xlab = expression(X[i]),
las = 1,
bty = "n"
)
We can produce the same three plots with ggplot2:
# Systematic component: LP
syst <- ggplot() +
  scale_y_continuous(limits = c(-3, 3)) +
  scale_x_continuous(limits = c(-3.5, 3.5)) +
  geom_point(aes(
    x = X, y = mu,
    color = Y
  )) +
  labs(
    x = expression(X[i]),
    y = expression(mu[i]),
    title = "Linear Predictor"
  ) +
  annotate("text",
    label = expression(mu[i] == X[i] * beta),
    x = 1, y = 1.8,
    hjust = 0
  )
# Systematic component: predicted probabilities
pps <- ggplot() +
  scale_y_continuous(limits = c(0, 1)) +
  scale_x_continuous(limits = c(-3.5, 3.5)) +
  geom_point(aes(
    x = X, y = p,
    color = Y
  )) +
  labs(
    x = expression(X[i]),
    y = expression(p[i]),
    title = "Predicted Probabilities"
  ) +
  annotate("text",
    label = expression(pi[i] == frac(exp(mu[i]), 1 + exp(mu[i]))),
    x = 1, y = 0.85
  )
# observed values
yhat <- ggplot() +
  scale_y_continuous(limits = c(0, 1)) +
  scale_x_continuous(limits = c(-3.5, 3.5)) +
  geom_point(aes(
    x = X, y = Y,
    color = Y
  )) +
  labs(
    x = expression(X[i]),
    y = expression(Y[i]),
    title = "Observed Values"
  )

syst + pps + yhat +
  plot_annotation(title = "Simulated Quantities") &
  scale_color_viridis(direction = -1) & theme(legend.position = "none")
Now we want to estimate a logit model, i.e., to get back from the observed values \(Y\) and the values of \(X\) to the coefficients \(\beta\) and probabilities \(\pi\).
We start by writing our own logistic response function in R: \[\pi = \dfrac{\overbrace{\exp(x)}^{\text{always greater than 0}}}{\underbrace{1 + \exp(x)}_{\text{1 + "always greater than 0"}}} \Rightarrow 0 < \pi < 1.\]
l_response <- function(x) {
  exp(x) / (1 + exp(x))
}
Check if your function works:
head(l_response(beta0 + beta1 * X))
As we want to estimate \(\beta_0\) and \(\beta_1\) using MLE, we need to set up a (log-)likelihood function. For this, we translate the log-likelihood of the logit model from slide 14 to R.
\[ \ell(\beta|y, X) = \sum_{i=1}^{n} [y_i \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)} + (1-y_i) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)})] \]
This looks messier than it is.
What happens to the log-likelihood function when \(y_i = 0\)?
\[ y_i \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)} + (1-y_i) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}) \\= \underbrace{0 \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}}_{\text{cancels out}} + \underbrace{(1-0) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)})}_{\text{stays}} \\= \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}) \]
The opposite is true when \(y_i = 1\):
\[ y_i \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)} + (1-y_i) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}) \\= \underbrace{1 \cdot \log \frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}}_{\text{stays}} + \underbrace{(1-1) \cdot \log(1-\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)})}_{\text{cancels out}} \\= \log(\frac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}) \] Depending on the value we observe, \(y_i = 0\) or \(y_i = 1\), we receive these reduced parts of the log-likelihood function for each observation and sum across them.
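We can verify this reduction numerically (a small sketch with an arbitrary linear predictor value):

# per-observation log-likelihood term for an arbitrary mu_i (here 0.3)
pi_i <- exp(0.3) / (1 + exp(0.3))

# y_i = 0: the full term reduces to log(1 - pi_i)
0 * log(pi_i) + (1 - 0) * log(1 - pi_i) == log(1 - pi_i)
## [1] TRUE

# y_i = 1: the full term reduces to log(pi_i)
1 * log(pi_i) + (1 - 1) * log(1 - pi_i) == log(pi_i)
## [1] TRUE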
Now we can translate this into R. Remember that we already have the response function \(\dfrac{\exp(X_{i}\beta)}{1+\exp(X_{i}\beta)}\) implemented as l_response().
logit_ll <- function(X, Y, theta) {
  beta0 <- theta[1]
  beta1 <- theta[2]

  mu <- beta0 + beta1 * X

  logl <- sum(Y * log(l_response(mu)) + (1 - Y) * log(1 - l_response(mu)))

  return(logl)
}
Thanks to this reparameterization of the linear predictor mu via l_response(), we can estimate theta as unbounded, yet still ensure that the predicted probabilities fall within the range of 0 and 1.
Now we maximize it numerically with the help of optimx.
# Set Starting Values
stval <- c(0, 0) # 2 values of theta -> 2 starting values

# Optimize
res <- optimx(
  par = stval, # start values
  fn = logit_ll, # log-likelihood function
  Y = Y,
  X = X,
  control = list(maximize = T)
)
## Maximizing -- use negfn and neggr

res
##                    p1         p2    value fevals gevals niter convcode kkt1
## Nelder-Mead 0.2442764 -0.8836574 -608.544     59     NA    NA        0 TRUE
## BFGS        0.2441709 -0.8837175 -608.544     44     10    NA        0 TRUE
##             kkt2 xtime
## Nelder-Mead TRUE  0.03
## BFGS        TRUE  0.03
Luckily, this is all implemented in R already. Have a look at the documentation of glm.
?glm
Let’s make use of it:
m0 <- glm(Y ~ X,
  family = binomial(link = logit)
)

summary(m0)
##
## Call:
## glm(formula = Y ~ X, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4003 -1.0666 0.6082 0.9917 1.9966
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.24415 0.06918 3.529 0.000417 ***
## X -0.88379 0.07805 -11.323 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1376.3 on 999 degrees of freedom
## Residual deviance: 1217.1 on 998 degrees of freedom
## AIC: 1221.1
##
## Number of Fisher Scoring iterations: 4
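As a quick consistency check (a sketch using the objects from above), our hand-coded log-likelihood evaluated at the glm estimates should reproduce the maximum that optimx found:

logit_ll(X = X, Y = Y, theta = coef(m0))
logLik(m0)
# both are about -608.5, the value in the optimx output above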
With glm we can also easily implement a probit model:
m0_probit <- glm(Y ~ X,
  family = binomial(link = probit)
)

m0_probit$coefficients
## (Intercept)           X
##   0.1463777  -0.5333905
Why do the coefficients differ for logit and probit models? (Slide 15 might help.)
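A hint: the two link functions assume different error variances. The standard logistic distribution has standard deviation \(\pi/\sqrt{3} \approx 1.81\), while the standard normal has 1, so logit coefficients are typically about 1.6 to 1.8 times larger than probit coefficients from the same data. A quick check (a sketch, not a formal result):

# ratio of logit to probit coefficients; roughly 1.6-1.7 here
coef(m0) / coef(m0_probit)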
The predicted probability from the logit model would be:
\[\hat\pi_i = \text{Pr}(y_i = 1) = \dfrac{\exp(X_i\hat\beta)}{1 + \exp(X_i\hat\beta)} = \dfrac{\exp(0.24 -0.88\cdot x_i)}{1 + \exp(0.24 -0.88\cdot x_i)}\]
head(l_response(coef(m0)[1] + coef(m0)[2] * X))
## [1] 0.5871943 0.4392744 0.4840094 0.4815857 0.3659704 0.8747143
# head(plogis(coef(m0)[1] + coef(m0)[2] * X))
\[1 - \hat\pi_i = \text{Pr}(y_i = 0) = 1 - \dfrac{\exp(X_i\hat\beta)}{1 + \exp(X_i\hat\beta)} = 1 - \dfrac{\exp(0.24 -0.88\cdot x_i)}{1 + \exp(0.24 -0.88\cdot x_i)}\]
head(1 - l_response(coef(m0)[1] + coef(m0)[2] * X))
## [1] 0.4128057 0.5607256 0.5159906 0.5184143 0.6340296 0.1252857
# head(1 - plogis(coef(m0)[1] + coef(m0)[2] * X))
These values together will sum to 1:
head(l_response(coef(m0)[1] + coef(m0)[2] * X) + (1 - l_response(coef(m0)[1] + coef(m0)[2] * X)))
## [1] 1 1 1 1 1 1
# head(plogis(coef(m0)[1] + coef(m0)[2] * X) +
# (1 - plogis(coef(m0)[1] + coef(m0)[2] * X)))
The predicted probability from the probit model would be:
\[\hat\pi_i = \text{Pr}(y_i = 1) = \Phi(X_i\hat\beta) = \Phi(0.15 - 0.53 \cdot x_i),\] where \(\Phi(\cdot)\) is the CDF of a standard normal distribution \(\mathcal{N}(0,1)\).
head(pnorm(coef(m0_probit)[1] + coef(m0_probit)[2] * X))
## [1] 0.5838282 0.4410540 0.4842106 0.4818756 0.3697047 0.8793730
\[1 - \hat\pi_i = \text{Pr}(y_i = 0) = 1 - \Phi(X_i\hat\beta) = 1 - \Phi(0.15 - 0.53 \cdot x_i)\]
head(1 - pnorm(coef(m0_probit)[1] + coef(m0_probit)[2] * X))
## [1] 0.4161718 0.5589460 0.5157894 0.5181244 0.6302953 0.1206270
These values together will sum to 1:
head(pnorm(coef(m0_probit)[1] + coef(m0_probit)[2] * X) +
  (1 - pnorm(coef(m0_probit)[1] + coef(m0_probit)[2] * X)))
## [1] 1 1 1 1 1 1
The coefficients of logit and (even more so) probit models are really hard to interpret beyond their sign. This is where our simulation approach for obtaining meaningful quantities of interest becomes really helpful.
Let’s start with plotting expected probabilities of our fake data model.
Remember the steps?
Steps for simulating parameters (Estimation Uncertainty):

1. Get the coefficients from the regression (beta_hat).
2. Get the variance-covariance matrix (V_hat).
3. Set up a multivariate normal distribution N(beta_hat, V_hat).
4. Draw from the distribution nsim times.

# 1. get the coefficients
beta_hat <- coef(m0)

# 2. Get the variance-covariance matrix
V_hat <- vcov(m0)
# 3. Set up a multivariate normal distribution N(beta_hat, V_hat)
# 4. Draw from the distribution nsim times
nsim <- 10000
S <- mvrnorm(
  n = nsim,
  mu = beta_hat,
  Sigma = V_hat
)
Next step: set up an interesting scenario and calculate expected values.
# we simulate over a sequence of x-values
seq_X <- seq(min(X), max(X), length.out = 100)
scenario <- cbind(1, seq_X)

# linear predictor
Xbeta <- S %*% t(scenario)

dim(Xbeta)
## [1] 10000   100
So far there is nothing new compared to OLS simulation!
Now comes the difference:
To get expected values for \(p\) (aka predicted probabilities), we need to plug the \(X_i\beta\) (Xbeta) values into the response function to get simulated probabilities.
p_sim <- exp(Xbeta) / (1 + exp(Xbeta))

# we can also use our own function:
p_sim <- l_response(Xbeta)

# or the built-in R function for the logistic distribution function (CDF):
p_sim <- plogis(Xbeta) # aka inverse logit

dim(p_sim)
## [1] 10000   100
As before, we also want means and quantiles of our quantities of interest.
p_mean <- apply(p_sim, 2, mean)
p_qu <- t(apply(p_sim, 2, quantile, prob = c(0.025, 0.975)))
plot(
x = seq_X,
y = p_mean,
ylim = c(0, 1),
xlim = range(pretty(seq_X)),
type = "n",
main = "Predicted Probabilies of Y",
ylab = "Probability of Y",
xlab = "X",
bty = "n",
las = 1
)
# plot uncertainty with a polygon
polygon(
x = c(rev(seq_X), seq_X),
y = c(rev(p_qu[, 2]), p_qu[, 1]),
col = viridis(1, 0.4),
border = NA
)
# and a line
lines(
x = seq_X,
y = p_mean,
lwd = 2
)
Remember that we know the true data generating process (because we simulated the data). This means that we also know the true probability of Y for each observation. Let’s see how well our estimation (and simulation) worked:
plot(
x = seq_X,
y = p_mean,
ylim = c(0, 1),
xlim = range(pretty(seq_X)),
type = "n",
main = "Predicted Probabilies of Y",
ylab = "Probability of Y",
xlab = "X",
bty = "n",
las = 1
)
# plot uncertainty with a polygon
polygon(
x = c(rev(seq_X), seq_X),
y = c(rev(p_qu[, 2]), p_qu[, 1]),
col = viridis(2, 0.4)[1],
border = NA
)
# and a line
lines(
x = seq_X,
y = p_mean,
lwd = 2
)
# we add the "true" probabilities
# note that these are unobserved in the real world
points(
x = X,
y = p,
pch = 1,
cex = 0.5,
ylim = c(0, 1),
col = viridis(2, 0.4)[2]
)
# data frame for predicted probabilities and CI
plot_df <- data.frame(
  "p_mean" = p_mean,
  "ci_lo" = p_qu[, 1],
  "ci_hi" = p_qu[, 2]
)
plot_df$seq_X <- seq_X
# plot
pp <- ggplot(data = plot_df, aes(x = seq_X, y = p_mean)) +
  geom_line() +
  geom_ribbon(
    aes(ymin = ci_lo, ymax = ci_hi),
    fill = viridis(1, 0.5), # add color for filling
    color = viridis(1), # add color for lines
    linetype = "dashed" # make lines dashed
  ) +
  labs(
    x = "X",
    y = "Probability of Y",
    title = "Predicted Probabilities of Y"
  )
pp
And the same check in ggplot2, adding the true probabilities to the plot:
fake_data <- data.frame(X, p)
pp + geom_point(
  data = fake_data,
  aes(x = X, y = p),
  shape = 1,
  color = viridis(2, 0.4)[2]
)
How can we check how good our logit model actually is?
An easy test is to cross-tabulate observed and predicted values of y. For this we classify the predicted probabilities as either 0 or 1. This of course depends on some cut-point. Usually we take a cutoff point of 0.5.
Let’s do this for our model: m0$fitted.values > 0.5 generates logical values (TRUE or FALSE) indicating whether each fitted value is above the threshold, and we multiply by 1 to make TRUE equal 1 and FALSE equal 0.
table(
  observed = Y,
  predicted = (m0$fitted.values > 0.5) * 1
  # or simply round(m0$fitted.values)
)
##         predicted
## observed   0   1
##        0 255 195
##        1 138 412
How many correctly predicted? How many falsely predicted?
Let’s make a classification table and calculate the percentage of correctly predicted (aka accuracy) cases (PCP; see slide 25).
class_table <- table(observed = Y, predicted = (m0$fitted.values > 0.5) * 1)
class_table
##         predicted
## observed   0   1
##        0 255 195
##        1 138 412

pcp <- sum(diag(class_table)) / sum(class_table)
pcp
## [1] 0.667
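The 0.5 cut-point is a convention, not a law. As a small sketch, we can recompute the accuracy over a grid of cut-points to see how sensitive the classification is to this choice:

# PCP for a range of classification cut-points (hypothetical grid)
cutoffs <- seq(0.1, 0.9, by = 0.1)
sapply(cutoffs, function(tau) mean((m0$fitted.values > tau) * 1 == Y))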
Another very good way to assess model fit is the separation plot. We will use the separationplot package to get started (already installed above).
A separation plot is a method to visually inspect the fit of a binary model. In this plot, the dark and light panels correspond to the actual instances of the events \((y_i = 1)\) and nonevents \((y_i = 0)\), respectively. Construction of the separation plot begins by simply rearranging the observations so that the fitted values, i.e. the predicted probabilities \(\pi_i\), are presented in ascending order (you can spot this in the black line on the plot). Then one notes whether each of these observations corresponds to an actual instance of the event \((y_i = 1)\) or a nonevent \((y_i = 0)\); these are depicted with the color panels mentioned above. The more nonevents on the left side and the more events on the right side, the better the model fit.
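To build intuition, here is a minimal hand-rolled version of this idea (a sketch, assuming m0 and Y from the fake-data example above; the real package adds more refinements):

# sort observations by fitted probability, color by actual outcome
ord <- order(m0$fitted.values)
plot(seq_along(Y), m0$fitted.values[ord],
  type = "n", ylim = c(0, 1),
  xlab = "Observations (sorted by fitted probability)",
  ylab = expression(hat(pi)[i]),
  bty = "n", las = 1
)
rect(
  xleft = seq_along(Y) - 0.5, xright = seq_along(Y) + 0.5,
  ybottom = 0, ytop = 1,
  col = ifelse(Y[ord] == 1, viridis(2, 0.5)[1], viridis(2, 0.5)[2]),
  border = NA
)
# overlay the sorted fitted probabilities
lines(seq_along(Y), m0$fitted.values[ord], lwd = 2)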
Greenhill, Ward, and Sacks (2011) explain this in more detail:
A model with no predictive power, i.e., one whose outcomes can be approximated by a random coin toss, would generate an even distribution of 0s and 1s along the column on the right-hand side. On the other hand, a model with perfect predictive power would produce a complete separation of the 0s and 1s in the right-hand column: low fitted values would always turn out to be associated with actual instances of peace (0s), whereas high fitted values would always be associated with actual instances of war (1s).
A perfect model would show complete separation of the dark and light-colored rows. The dark line, showing the respective \(\pi_i\), allows us to judge whether the overall degree of separation between events and nonevents is associated with sharp differences in predicted probabilities or more modest ones.
Another quantity of interest is the expected number of total events predicted by the model. We can calculate this by simply adding up the predicted probabilities across all observations and rounding the value. This quantity, depicted with a small black triangle on the plot, allows us to see how the total number of events predicted by the model compares to the actual number of events in the data.
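For our fake-data model this quantity is easy to compute by hand (a quick sketch):

# expected number of events: the sum of predicted probabilities
sum(m0$fitted.values)
# actual number of events; for a logit with an intercept the two match exactly
sum(Y)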
This is what the separation plot for our fake data would look like:
separationplot(
pred = m0$fitted,
actual = Y,
line = TRUE,
heading = "Separation Plot for Model 0",
show.expected = T,
col0 = viridis(2, alpha = 0.5)[2],
col1 = viridis(2, alpha = 0.5)[1],
lwd2 = 2,
height = 2,
newplot = F
)
If you want to know more about Separation Plots and see more examples of using them, you should read the paper by Greenhill, Ward, and Sacks (2011).
In case you want to keep all plots as ggplot2 objects for some reason (say, to keep the figure fonts uniform across the paper), you can transform these base R objects into ggplot2 objects with the ggplotify package:
as.ggplot(~ separationplot(
  pred = m0$fitted,
  actual = Y,
  line = TRUE,
  show.expected = T,
  col0 = viridis(2)[2],
  col1 = viridis(2)[1],
  lwd2 = 2,
  newplot = F
)) +
  labs(title = "Separation Plot for Model 0")
Let’s have a look at some real data!
We will work with the Fearon and Laitin (2003) dataset on ethnicity, insurgency, and civil wars.
First, we estimate the effect of log mountains (log_mountain) on the onset of civil war (civilwar) using optimx and the functions from above.
We start with a simple bivariate model.
<- read.dta("raw-data/fearon.dta")
df stargazer(df, type = stargazer_opt)
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
|---|---|---|---|---|---|---|---|
| civilwar | 6,191 | 0.017 | 0.129 | 0 | 0 | 0 | 1 |
| ethnicwar | 6,191 | 0.011 | 0.106 | 0 | 0 | 0 | 1 |
| priorwar | 6,191 | 0.133 | 0.340 | 0 | 0 | 0 | 1 |
| gdp_lagged | 6,191 | 3.609 | 4.327 | 0.048 | 0.930 | 4.469 | 53.901 |
| log_population | 6,191 | 9.078 | 1.456 | 5.403 | 8.091 | 9.946 | 14.029 |
| log_mountain | 6,191 | 2.177 | 1.409 | 0.000 | 0.993 | 3.378 | 4.557 |
| noncontiguous | 6,191 | 0.178 | 0.383 | 0 | 0 | 0 | 1 |
| oil | 6,191 | 0.128 | 0.334 | 0 | 0 | 0 | 1 |
| newstate | 6,191 | 0.026 | 0.160 | 0 | 0 | 0 | 1 |
| instability | 6,191 | 0.146 | 0.353 | 0 | 0 | 0 | 1 |
| democracy1 | 6,191 | -0.479 | 7.555 | -10 | -7 | 8 | 10 |
| ethnicfrac | 6,191 | 0.388 | 0.286 | 0.001 | 0.107 | 0.667 | 0.925 |
| relifrac | 6,191 | 0.366 | 0.219 | 0.000 | 0.182 | 0.557 | 0.783 |
| anocracy | 6,191 | 0.218 | 0.413 | 0 | 0 | 0 | 1 |
| democracy2 | 6,191 | 0.335 | 0.472 | 0 | 0 | 1 | 1 |
res2 <- optimx(stval,
  logit_ll,
  Y = df$civilwar,
  X = df$log_mountain,
  control = list(maximize = T)
)
## Maximizing -- use negfn and neggr

res2
##                    p1        p2     value fevals gevals niter convcode kkt1
## Nelder-Mead -4.740625 0.2764533 -521.1480     77     NA    NA        0 TRUE
## BFGS        -4.742026 0.2765875 -521.1479     40     16    NA        0 TRUE
##             kkt2 xtime
## Nelder-Mead TRUE  0.15
## BFGS        TRUE  0.15
Now we want to fit the same logit regression as above, but this time using glm:
m1 <- glm(civilwar ~ log_mountain,
  data = df,
  family = binomial(link = logit)
)
Let’s see how we did and compare the results to our likelihood function.
m1$coefficients
##  (Intercept) log_mountain
##   -4.7412093    0.2763476

res2[1:2, 1:2]
##                    p1        p2
## Nelder-Mead -4.740625 0.2764533
## BFGS        -4.742026 0.2765875
We load the data and omit all observations with missing values.
<- read.dta("raw-data/fearon_rep.dta")
df summary(df)
## civilwar ethnicwar priorwar gdp_lagged
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. : 0.048
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 0.930
## Median :0.00000 Median :0.0000 Median :0.0000 Median : 1.995
## Mean :0.01664 Mean :0.0112 Mean :0.1346 Mean : 3.651
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 4.484
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :66.735
## NA's :1 NA's :1 NA's :237
## log_population log_mountain noncontiguous oil
## Min. : 5.403 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 8.076 1st Qu.:0.9933 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 9.004 Median :2.4248 Median :0.0000 Median :0.0000
## Mean : 9.063 Mean :2.1767 Mean :0.1734 Mean :0.1295
## 3rd Qu.: 9.933 3rd Qu.:3.3393 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :14.029 Max. :4.5570 Max. :1.0000 Max. :1.0000
## NA's :177
## newstate instability democracy1 ethnicfrac
## Min. :0.00000 Min. :0.0000 Min. :-10.0000 Min. :0.0010
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: -7.0000 1st Qu.:0.1073
## Median :0.00000 Median :0.0000 Median : -3.0000 Median :0.3255
## Mean :0.02965 Mean :0.1465 Mean : -0.4799 Mean :0.3854
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.: 8.0000 3rd Qu.:0.6637
## Max. :1.00000 Max. :1.0000 Max. : 10.0000 Max. :0.9250
## NA's :14 NA's :69
## relifrac anocracy democracy2
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1818 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.3576 Median :0.0000 Median :0.0000
## Mean :0.3674 Mean :0.2256 Mean :0.3322
## 3rd Qu.:0.5566 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :0.7828 Max. :1.0000 Max. :1.0000
## NA's :69 NA's :69
df2 <- na.omit(df)
summary(df2)
## civilwar ethnicwar priorwar gdp_lagged
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. : 0.0480
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 0.9295
## Median :0.0000 Median :0.00000 Median :0.0000 Median : 1.9980
## Mean :0.0168 Mean :0.01131 Mean :0.1331 Mean : 3.6085
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.: 4.4690
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :53.9010
## log_population log_mountain noncontiguous oil
## Min. : 5.403 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 8.091 1st Qu.:0.9933 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 9.018 Median :2.4248 Median :0.0000 Median :0.0000
## Mean : 9.078 Mean :2.1772 Mean :0.1782 Mean :0.1279
## 3rd Qu.: 9.946 3rd Qu.:3.3776 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :14.029 Max. :4.5570 Max. :1.0000 Max. :1.0000
## newstate instability democracy1 ethnicfrac
## Min. :0.00000 Min. :0.000 Min. :-10.0000 Min. :0.0010
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.: -7.0000 1st Qu.:0.1073
## Median :0.00000 Median :0.000 Median : -3.0000 Median :0.3331
## Mean :0.02617 Mean :0.146 Mean : -0.4786 Mean :0.3880
## 3rd Qu.:0.00000 3rd Qu.:0.000 3rd Qu.: 8.0000 3rd Qu.:0.6666
## Max. :1.00000 Max. :1.000 Max. : 10.0000 Max. :0.9250
## relifrac anocracy democracy2
## Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.1818 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.3508 Median :0.0000 Median :0.000
## Mean :0.3662 Mean :0.2184 Mean :0.335
## 3rd Qu.:0.5566 3rd Qu.:0.0000 3rd Qu.:1.000
## Max. :0.7828 Max. :1.0000 Max. :1.000
Let’s estimate the model:
m_fearon1 <- glm(
  civilwar ~
    priorwar +
    gdp_lagged +
    log_population +
    log_mountain +
    noncontiguous +
    oil +
    newstate +
    instability +
    democracy1 +
    ethnicfrac +
    relifrac,
  data = df,
  family = binomial(link = logit)
)
In order to make sense of our results, let’s plot predicted probabilities for the onset of civil war at different levels of log_mountain, setting all other variables to their mean (continuous) or median (discrete). Here is the scenario for you:
log_mountain_range <-
  seq(min(df$log_mountain, na.rm = T),
    max(df$log_mountain, na.rm = T),
    length.out = 100
  )

scenario <-
  cbind(
    1, # The Intercept
    median(df$priorwar, na.rm = T), # The median of priorwar
    mean(df$gdp_lagged, na.rm = T), # The mean of gdp_lagged
    mean(df$log_population, na.rm = T), # The mean of log_population
    log_mountain_range, # Our sequence for log_mountain
    median(df$noncontiguous, na.rm = T), # The median of noncontiguous
    median(df$oil, na.rm = T), # The median of oil
    median(df$newstate, na.rm = T), # The median of newstate
    median(df$instability, na.rm = T), # The median of instability
    mean(df$democracy1, na.rm = T), # The mean of democracy1
    mean(df$ethnicfrac, na.rm = T), # The mean of ethnicfrac
    mean(df$relifrac, na.rm = T) # The mean of relifrac
  )
# Have a look at the scenario
head(scenario)
## log_mountain_range
## [1,] 1 0 3.651117 9.062762 0.00000000 0 0 0 0 -0.479896 0.3853547
## [2,] 1 0 3.651117 9.062762 0.04603060 0 0 0 0 -0.479896 0.3853547
## [3,] 1 0 3.651117 9.062762 0.09206121 0 0 0 0 -0.479896 0.3853547
## [4,] 1 0 3.651117 9.062762 0.13809181 0 0 0 0 -0.479896 0.3853547
## [5,] 1 0 3.651117 9.062762 0.18412241 0 0 0 0 -0.479896 0.3853547
## [6,] 1 0 3.651117 9.062762 0.23015302 0 0 0 0 -0.479896 0.3853547
##
## [1,] 0.3673768
## [2,] 0.3673768
## [3,] 0.3673768
## [4,] 0.3673768
## [5,] 0.3673768
## [6,] 0.3673768
# Is everything in the correct order?
coef(m_fearon1)
## (Intercept) priorwar gdp_lagged log_population log_mountain
## -6.57931110 -0.88696614 -0.35034944 0.24925147 0.21925279
## noncontiguous oil newstate instability democracy1
## 0.34317774 0.83918510 1.70935127 0.64393145 0.02424965
## ethnicfrac relifrac
## 0.18804229 0.25036423
Now, it’s your turn.
First, could you describe the scenario in your own words?
Now, you can follow exactly the same steps as above:

1. Get the coefficients from the regression (beta_hat).
2. Get the variance-covariance matrix (V_hat).
3. Set up a multivariate normal distribution N(beta_hat, V_hat).
4. Draw from the distribution nsim times.

beta_hat <-
V_hat <-

library(MASS)
S <-
Let’s use the scenario from above.

Xbeta <-
To get expected values for p, we need to plug the Xbeta values into the response function to get simulated probabilities.

p_sim <-
Pro Question: How could we include fundamental uncertainty?
Now we also want means and quantiles:

p_mean <-
p_qu <-
This time we instantly plot a polygon and include ticks for actual x-values of our observations:
What can we learn from this plot?
Of course you could (and should!) also unlog the mountains…
# start with an empty plot
plot(
x = exp(log_mountain_range),
y = p_mean,
type = "n",
ylim = c(0, 0.045),
ylab = "Probability of Civil War onset",
xlab = "Mountainous Terrain in %",
bty = "n",
las = 1
)
# draw polygon
polygon(
x = c(rev(exp(log_mountain_range)), exp(log_mountain_range)),
y = c(rev(p_qu[, 2]), p_qu[, 1]),
col = viridis(1, 0.4),
border = NA
)
# add lines
lines(
  x = exp(log_mountain_range),
  y = p_mean, lwd = 2
)
lines(
  x = exp(log_mountain_range),
  y = p_qu[, 1],
  lty = "dashed",
  col = viridis(1)
)
lines(
  x = exp(log_mountain_range),
  y = p_qu[, 2],
  lty = "dashed",
  col = viridis(1)
)
# Adding ticks of actual x-values.
axis(1,
at = exp(df$log_mountain),
col.ticks = viridis(1),
labels = FALSE,
tck = 0.02
)
Again, we want to assess model fit. Remember: simply running a model without testing for model fit is dangerous!
As above, we will look at classification tables and separation plots. Let’s do this for Model 1 from above.
# Expected Values (no fundamental uncertainty)
table(
  observed = factor(m_fearon1$model$civilwar, levels = 0:1),
  predicted = factor((m_fearon1$fitted.values > 0.5) * 1, levels = 0:1)
)
##         predicted
## observed    0    1
##        0 6087    0
##        1  104    0

# Predicted Values (including fundamental uncertainty)
table(
  observed = factor(m_fearon1$model$civilwar, levels = 0:1),
  predicted = factor(rbinom(
    n = length(m_fearon1$fitted.values),
    size = 1,
    p = m_fearon1$fitted.values
  ),
  levels = 0:1
  )
)
##         predicted
## observed    0    1
##        0 5983  104
##        1  104    0
separationplot(m_fearon1$fitted, m_fearon1$model$civilwar,
line = TRUE,
heading = "Fearon/Laitin: Model 1",
show.expected = T,
height = 2,
col0 = viridis(2)[2],
col1 = viridis(2)[1],
lwd2 = 2,
newplot = F
)
Again, we can also make classification tables and calculate the percentage of correctly predicted (aka accuracy) cases (PCP).
class_table1 <-
  table(
    observed = factor(m_fearon1$model$civilwar, levels = 0:1),
    predicted = factor((m_fearon1$fitted.values > 0.5) * 1, levels = 0:1)
  )

pcp_1 <- sum(diag(class_table1)) / sum(class_table1)
pcp_1
## [1] 0.9832014
Sounds pretty good, doesn’t it?
But how well would a naive model, one that always predicts the majority class, do at predicting civil wars?
That’s the so-called Percentage of observations in the Modal Category (PMC):
table(m_fearon1$model$civilwar)
##
## 0 1
## 6087 104
max(table(m_fearon1$model$civilwar)) / sum(table(m_fearon1$model$civilwar))
## [1] 0.9832014
That’s exactly the same accuracy…
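With such a rare outcome, raw accuracy is uninformative. One way to quantify the improvement over the naive model (an addition here, not from the slides) is the proportional reduction in error, which compares PCP to PMC:

pmc <- max(table(m_fearon1$model$civilwar)) / sum(table(m_fearon1$model$civilwar))

# proportional reduction in error: 0 means no improvement over the naive model
(pcp_1 - pmc) / (1 - pmc)
## [1] 0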
We can also make our life a little easier and write a function that will calculate the average case scenario for us:
# m_fearon1$model contains the data used to estimate the model
# there are no missing data there, and the variables are in the same order as
# the main terms in the regression equation (but this is not as straightforward
# with models with interactions or cases where we pass character or factor vars
# into the model)
# the first column is the dependent variable
head(m_fearon1$model)
## civilwar priorwar gdp_lagged log_population log_mountain noncontiguous oil
## 1 0 0 7.626 11.85630 3.214868 1 0
## 2 0 0 7.626 11.86313 3.214868 1 0
## 3 0 0 7.654 11.86859 3.214868 1 0
## 4 0 0 8.025 11.88673 3.214868 1 0
## 5 0 0 8.270 11.90488 3.214868 1 0
## 6 0 0 8.040 11.93343 3.214868 1 0
## newstate instability democracy1 ethnicfrac relifrac
## 1 0 0 10 0.3569501 0.596
## 2 0 0 10 0.3569501 0.596
## 3 0 0 10 0.3569501 0.596
## 4 0 0 10 0.3569501 0.596
## 5 0 0 10 0.3569501 0.596
## 6 0 0 10 0.3569501 0.596
average_case <- function(x, rep = 1) {
  # non-numeric variables need to be transformed first
  if (!is.numeric(x)) {
    return(paste0("Transform variable to numeric"))
  }
  # if there are only two unique values in x and the difference between them
  # is 1 (i.e. it is a dummy variable 0/1, 1/2 or similar)
  if (length(unique(x)) == 2 & diff(range(x)) == 1) {
    # we take the median
    rep(median(x, na.rm = T), rep)
  } else {
    # otherwise, we take the mean
    rep(mean(x, na.rm = T), rep)
  }
}

# calculate the average for all independent variables
scenario <- cbind(1, apply(m_fearon1$model[, -1], 2, average_case, rep = 100))
# substitute the values in one variable with a sequence
scenario[, which(colnames(scenario) == "log_mountain")] <-
  seq(min(df$log_mountain, na.rm = T),
    max(df$log_mountain, na.rm = T),
    length.out = 100
  )
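As a quick sanity check (a sketch), the scenario columns should line up with the order of the model coefficients; the first, unnamed column is the intercept:

colnames(scenario)
names(coef(m_fearon1))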