The probability mass function (PMF), and hence the likelihood function (\(L\)), of a multinomial distribution is
\[ L(\bm{p}) = f(\bm{x}; \bm{p}) = \begin{cases} \frac{n!}{\prod_{i=1}^{k} x_{i}!} \prod_{i=1}^{k} p_{i}^{x_{i}}, & \text{when } \sum_{i=1}^{k} x_{i} = n\\ 0, & \text{otherwise}, \end{cases} \] for non-negative integers \(x_{1}, \dots, x_{k}\).
We work with the corresponding log-likelihood,
\[ l(\bm{p}) = \log L(\bm{p}) = \log n! - \sum_{i=1}^{k} \log x_{i}! + \sum_{i=1}^{k} x_{i} \log p_{i}. \]
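As a quick sanity check (a minimal sketch added here, not part of the original analysis), the PMF above can be compared against R's dmultinom() for a single observed vector:
x_obs <- c(3, 2, 5) ; p_true <- c(.3, .2, .5)
lfactorial(sum(x_obs)) - sum(lfactorial(x_obs)) + sum(x_obs * log(p_true)) # log L by hand
dmultinom(x_obs, prob = p_true, log = TRUE)                                # should match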
We consider \(k = 3\) categories and generate \(n = 100\) random vectors.
probs <- c(.3, .2, .5) # probabilities for each of the k categories
data_generator <- function(n, p) {
# each row is one multinomial trial of size 1 with probabilities p
data <- t(rmultinom(n, 1, prob = p))
return(data)
}
set.seed(1993)
data <- data_generator(n = 100, p = probs)
t(data)[ , 1:15] # each column is a random vector
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15]
## [1,] 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0
## [3,] 1 1 1 1 0 1 0 0 1 1 0 1 1 0 1
colSums(data) # counts close to n times the fixed probabilities, as expected
## [1] 27 22 51
Writing the (negative) log-likelihood function; it also records each point visited by the optimizer in the global vector ps, which we use later to trace the optimization paths:
multinom_lkl <- function(par_init, xs) {
k <- ncol(xs)
# only k - 1 free parameters; the last probability is the complement
p <- par_init[1:(k - 1)]
ps <<- c(ps, p) # trace of the points visited by the optimizer
lkl_p <- sum(xs[ , -k] %*% log(p)) + sum(xs[ , k] * log(1 - sum(p)))
# normalizing constant (does not depend on p)
n <- nrow(xs)
nx <- colSums(xs)
lkl <- lfactorial(n) - sum(lfactorial(nx)) + lkl_p
return(-lkl)
}
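Before maximizing, a quick spot check (added here, not in the original) that the function behaves as expected: the negative log-likelihood should be smaller at the true probabilities than at a poor guess.
ps <- c() # the global trace used inside multinom_lkl
multinom_lkl(c(.3, .2), xs = data) # at the true (p1, p2)
multinom_lkl(c(.1, .1), xs = data) # at a poor guess: should be larger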
To visualize the likelihood surface around the estimates we use the deviance,
\[ \text{deviance} \equiv D(\bm{p}) = -2~\{l(\bm{p}) - l(\hat{\bm{p}})\}. \]
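Under the usual regularity conditions \(D(\bm{p})\) is asymptotically \(\chi^{2}\) with 2 degrees of freedom here (two free parameters), which is why the contour levels used below are drawn at chi-squared quantiles, e.g.
qchisq(c(.5, .9, .95, .99), df = 2) # deviance cutoffs for approximate confidence regions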
We now do the maximization with different optimization routines:
library(bbmle)
initial_values <- c(.1, .1)
names(initial_values) <- parnames(multinom_lkl) <- c("p1", "p2")
ps <- c()
est_nm <- mle2(multinom_lkl, start = initial_values, vecpar = TRUE,
data = list(xs = data), method = "Nelder-Mead")
trace_nm <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
# estimated probabilities
c(est_nm@coef, p3 = 1 - sum(est_nm@coef))
## p1 p2 p3
## 0.2699814 0.2200481 0.5099704
These are equal to the closed-form MLEs, \(\hat{p}_{i} = x_{i}~/~n\). Behold:
colSums(data)/nrow(data)
## [1] 0.27 0.22 0.51
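This is no coincidence: setting the score to zero (a standard derivation, sketched here for completeness) gives
\[ \frac{\partial l}{\partial p_{i}} = \frac{x_{i}}{p_{i}} - \frac{x_{k}}{1 - \sum_{j=1}^{k-1} p_{j}} = 0 \quad \Rightarrow \quad \frac{x_{i}}{\hat{p}_{i}} = \frac{x_{k}}{\hat{p}_{k}} \;\;\forall~i \quad \Rightarrow \quad \hat{p}_{i} = \frac{x_{i}}{n}, \] since the common ratio must equal \(\sum_{i} x_{i}~/~\sum_{i} \hat{p}_{i} = n\).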
initial_values <- c(.1, .1)
names(initial_values) <- parnames(multinom_lkl) <- c("p1", "p2")
ps <- c()
est_bfgs <- mle2(multinom_lkl, start = initial_values, vecpar = TRUE,
data = list(xs = data), method = "BFGS")
trace_bfgs <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
# estimated probabilities
c(est_bfgs@coef, p3 = 1 - sum(est_bfgs@coef))
## p1 p2 p3
## 0.2699995 0.2200006 0.5099999
contour_dev <- function(est, FUN, df,
p1min = .15, p1max = .42, p2min = .10, p2max = .37,
desc = NULL, desc_cex = 1.5,
lab_cex = 1.5, axis_cex = 1.25,
contour_add = FALSE, contour_col = 1, ...) {
multinom_dev <- function(p_grid, par_est, ...) {
return(2 *(FUN(p_grid, ...) - FUN(par_est, ...)))
}
dev_contour <- Vectorize(function(a, b, ...)
multinom_dev(c(a, b), ...))
p1_grid <- seq(p1min, p1max, length.out = 30)
p2_grid <- seq(p2min, p2max, length.out = 30)
outer_grid <- outer(p1_grid, p2_grid, dev_contour,
xs = df, par_est = est, ...)
contour_levels <- c(.99, .95, .9, .8, .7, .5, .3, .1, .05, .01)
contour(p1_grid, p2_grid, outer_grid,
levels = qchisq(contour_levels, df = 2),
labels = contour_levels,
xlab = expression(p[1]), ylab = expression(p[2]),
main = desc, cex.main = desc_cex,
cex.lab = lab_cex, cex.axis = axis_cex,
add = contour_add, col = contour_col)
abline(v = est[1], h = est[2], lty = 2, col = "darkgray")
}
contour_dev_plus <- function(est, method, trace) {
contour_dev(est, FUN = multinom_lkl, df = data,
desc = paste(
"Deviance contour\n(MLEs via", method, "method)"))
for (i in 1:nrow(trace)) text(trace[i, 1], trace[i, 2],
labels = i, col = "#0080ff", cex = 1.25)
}
par(mfrow = c(2, 3))
contour_dev_plus(est_nm@coef, method = "Nelder-Mead", trace_nm)
contour_dev_plus(est_bfgs@coef, method = "BFGS", trace_bfgs)
# trying different optimization routines from the optimx package =======
library(optimx)
ps <- c()
est_cg <- optimx(c(.1, .1), multinom_lkl, xs = data, method = "CG")
trace_cg <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
contour_dev_plus(coef(est_cg), method = "CG", trace_cg)
ps <- c()
est_nlm <- optimx(c(.1, .1), multinom_lkl, xs = data, method = "nlm")
trace_nlm <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
contour_dev_plus(coef(est_nlm), method = "NLM", trace_nlm)
library(Rcgmin)
ps <- c()
est_rcg <- optimx(c(.1, .1), multinom_lkl, xs = data, method = "Rcgmin")
trace_rcg <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
contour_dev_plus(coef(est_rcg), method = "Rcgmin", trace_rcg)
library(minqa)
ps <- c()
est_boby <- optimx(c(.1, .1), multinom_lkl, xs = data, method = "bobyqa")
trace_boby <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
contour_dev_plus(coef(est_boby), method = "bobyqa", trace_boby)
Now we repeat the fits supplying the analytic gradient of the negative log-likelihood, \(-\partial l~/~\partial p_{i} = -\left(x_{i}~/~p_{i} - x_{k}~/~(1 - \sum_{j<k} p_{j})\right)\):
gradient <- function(par_init, xs) {
x <- colSums(xs)
# derivative of -l with respect to p1 and p2 (p3 is the complement)
common_term <- x[3]/(1 - sum(par_init))
gg <- c(x[1:2]/par_init) - common_term
return(-gg)
}
parnames(gradient) <- c("p1", "p2")
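As an optional sanity check (assuming the numDeriv package is available), the analytic gradient can be compared with a numerical one at an arbitrary point:
library(numDeriv)
ps <- c() # multinom_lkl appends to this global trace
rbind(analytic  = gradient(c(.25, .25), xs = data),
      numerical = grad(multinom_lkl, c(.25, .25), xs = data))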
necessary_fit <- function(opt, meth) {
initial_values <- c(.1, .1)
names(initial_values) <- parnames(multinom_lkl) <- c("p1", "p2")
est <- mle2(multinom_lkl, start = initial_values, vecpar = TRUE,
data = list(xs = data),
optimizer = opt, method = meth, gr = gradient)
trace_meth <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
return(list(coef = est@coef, trace = trace_meth))
}
par(mfrow = c(2, 3))
# Nelder-Mead ==========================================================
ps <- c()
est_nm <- necessary_fit(opt = "optim", meth = "Nelder-Mead")
contour_dev_plus(est_nm$coef, method = "Nelder-Mead", est_nm$trace)
# BFGS =================================================================
ps <- c()
est_bfgs <- necessary_fit(opt = "optim", meth = "BFGS")
contour_dev_plus(est_bfgs$coef, method = "BFGS", est_bfgs$trace)
# CG ===================================================================
ps <- c()
est_cg <- necessary_fit(opt = "optim", meth = "CG")
contour_dev_plus(est_cg$coef, method = "CG", est_cg$trace)
# NLMINB ===============================================================
ps <- c()
est_nlminb <- necessary_fit(opt = "optimx", meth = "nlminb")
contour_dev_plus(est_nlminb$coef, method = "nlminb", est_nlminb$trace)
# Rcgmin ===============================================================
ps <- c()
est_rcg <- necessary_fit(opt = "optimx", meth = "Rcgmin")
contour_dev_plus(est_rcg$coef, method = "Rcgmin", est_rcg$trace)
# BOBYQA ===============================================================
ps <- c()
est_boby <- necessary_fit(opt = "optimx", meth = "bobyqa")
contour_dev_plus(est_boby$coef, method = "bobyqa", est_boby$trace)
Now we look at the quadratic approximation of the log-likelihood,
\[ l(\bm{p}) \approx l(\hat{\bm{p}}) - \frac{1}{2}~ (\bm{p}-\hat{\bm{p}})^{\top}~I_{E}(\hat{\bm{p}})~(\bm{p}-\hat{\bm{p}}), \] with
\[ \begin{align} H_{\text{essian}}(\bm{p}) &= \begin{bmatrix} \partial^{2} l(\bm{p})~/~\partial p_{1}^{2} & \partial^{2} l(\bm{p})~/~\partial p_{1} p_{2} \\ \partial^{2} l(\bm{p})~/~\partial p_{2} p_{1} & \partial^{2} l(\bm{p})~/~\partial p_{2}^{2} \end{bmatrix}\\ &= \begin{bmatrix} -x_{1}~/~p_{1}^{2} - x_{3}~/~(1 - p_{1} - p_{2})^{2} & -x_{3}~/~(1 - p_{1} - p_{2})^{2}\\ -x_{3}~/~(1 - p_{1} - p_{2})^{2} & -x_{2}~/~p_{2}^{2} - x_{3}~/~(1 - p_{1} - p_{2})^{2} \end{bmatrix}\\ I_{O}(\bm{p}) = -H(\bm{p}) &= \begin{bmatrix} x_{1}~/~p_{1}^{2} + x_{3}~/~(1 - p_{1} - p_{2})^{2} & x_{3}~/~(1 - p_{1} - p_{2})^{2}\\ x_{3}~/~(1 - p_{1} - p_{2})^{2} & x_{2}~/~p_{2}^{2} + x_{3}~/~(1 - p_{1} - p_{2})^{2} \end{bmatrix}\\ I_{E}(\bm{p}) = \mathbb{E}\{I_{O}(\bm{p})\} &= \begin{bmatrix} n~/~p_{1} + n~/~p_{3} & n~/~p_{3}\\ n~/~p_{3} & n~/~p_{2} + n~/~p_{3} \end{bmatrix}, \end{align} \] since \(\mathbb{E}\{x_{i}\} = n~p_{i}\).
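These expressions can be verified numerically (a hedged check, again assuming numDeriv is installed): the Hessian of the negative log-likelihood at the MLE should match \(I_{O}(\hat{\bm{p}})\).
library(numDeriv)
p_hat <- colSums(data)/nrow(data) # MLEs x_i/n
x_counts <- colSums(data)
ps <- c()
hessian(multinom_lkl, p_hat[1:2], xs = data) # numerical I_O (we minimize -l)
matrix(c(x_counts[1]/p_hat[1]^2 + x_counts[3]/p_hat[3]^2, x_counts[3]/p_hat[3]^2,
         x_counts[3]/p_hat[3]^2, x_counts[2]/p_hat[2]^2 + x_counts[3]/p_hat[3]^2),
       2, byrow = TRUE) # analytic I_O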
Thus, the asymptotic variance-covariance matrix is
\[ \begin{align} I_{E}(\bm{p})^{-1} &= \begin{bmatrix} p_{1}~(p_{2} + p_{3})~/~n & -~p_{1}~p_{2}~/~n\\ -~p_{1}~p_{2}~/~n & p_{2}~(p_{1} + p_{3})~/~n \end{bmatrix} = \frac{1}{n} \begin{bmatrix} p_{1}~(1 - p_{1}) & -~p_{1}~p_{2}\\ -~p_{1}~p_{2} & p_{2}~(1 - p_{2}) \end{bmatrix}. \end{align} \]
And the correlation between \(\hat{p}_{1}\) and \(\hat{p}_{2}\), \(\rho\), is given by
\[ \begin{align} \rho_{~p_{1}~p_{2}} &= \frac{-~p_{1}~p_{2}~/~n}{\sqrt{p_{1}~(p_{2} + p_{3})~/~n}~\sqrt{p_{2}~(p_{1} + p_{3})~/~n}} \\ &= -~\frac{p_{1}~p_{2}}{\sqrt{p_{1}~p_{2}}~\sqrt{(p_{1} + p_{3})~(p_{2} + p_{3})}} \\ &= -~\frac{\sqrt{p_{1}~p_{2}}}{ \sqrt{p_{1} + p_{3}}~\sqrt{p_{2} + p_{3}}}. \end{align} \]
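Evaluating this at the MLEs from the simulated data (a quick numeric check, not in the original) and comparing with the correlation implied by \(I_{E}(\hat{\bm{p}})^{-1}\):
p_hat <- colSums(data)/nrow(data)
n_obs <- nrow(data)
Ie <- n_obs * matrix(c(1/p_hat[1] + 1/p_hat[3], 1/p_hat[3],
                       1/p_hat[3], 1/p_hat[2] + 1/p_hat[3]), 2, byrow = TRUE)
cov2cor(solve(Ie))[1, 2] # correlation from the inverse expected information
-sqrt(p_hat[1] * p_hat[2])/(sqrt(p_hat[1] + p_hat[3]) * sqrt(p_hat[2] + p_hat[3]))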
multinom_quad <- function(p, xs, model) {
n <- nrow(xs)
off_diag <- n/(1 - sum(model@coef)) # n/p3
Ie <- matrix(c(n/model@coef[1] + off_diag, rep(off_diag, 2),
n/model@coef[2] + off_diag), 2, byrow = TRUE)
# negative of the quadratic approximation, to mirror multinom_lkl
neg_quad <- multinom_lkl(par_init = model@coef, xs = xs) +
.5 * (p - model@coef) %*% Ie %*% (p - model@coef)
return(neg_quad)
}
rho <- function(par_est, xp, yp) {
pars <- par_est@coef
pars <- c(pars, 1 - sum(pars)) # append p3, the complement
p1 <- pars[1] ; p2 <- pars[2] ; p3 <- pars[3]
# correlation between the MLEs of p1 and p2 (see the formula above)
rho <- (-1) * sqrt(p1 * p2)/(sqrt(p1 + p3) * sqrt(p2 + p3))
points(p1, p2, pch = 19, cex = .75) # mark the point estimate
text(xp, yp, cex = 1.5,
labels = substitute(paste(rho[ ~ p[1] ~ p[2]], " = ", r),
list(r = round(rho, 3))))
}
contour_combo <- function(data, level = c("small", "medium", "larger"),
init, p1min, p1max, p2min, p2max,
xp = NA, yp = NA, ...) {
data <- switch(level,
"small" = data,
"medium" = do.call(
rbind, replicate(10, data, simplify = FALSE)),
"larger" = do.call(
rbind, replicate(100, data, simplify = FALSE)))
names(init) <- parnames(multinom_lkl) <- c("p1", "p2")
est_nm <- mle2(multinom_lkl, start = init, data = list(xs = data),
vecpar = TRUE, method = "Nelder-Mead")
contour_dev(est = est_nm@coef, FUN = multinom_lkl, df = data,
p1min, p1max, p2min, p2max,
desc = paste("sample size:", nrow(data)),
desc_cex = 1.5, lab_cex = 1.5, axis_cex = 1.25)
contour_dev(est = est_nm@coef, FUN = multinom_quad, df = data,
model = est_nm, p1min, p1max, p2min, p2max,
contour_add = TRUE, contour_col = 2)
rho(est_nm, xp, yp)
}
par(mfrow = c(2, 3))
set.seed(1994)
data <- data_generator(10, p = probs)
contour_combo(data, level = "small", init = c(.1, .1), xp = .6, yp = .8,
p1min = .03, p1max = .76, p2min = .06, p2max = .83)
contour_combo(data, level = "medium", init = c(.1, .1),
p1min = .03, p1max = .76, p2min = .06, p2max = .83)
contour_combo(data, level = "larger", init = c(.1, .1),
p1min = .03, p1max = .76, p2min = .06, p2max = .83)
set.seed(1998)
data <- data_generator(20, p = c(.05, .3, .65))
contour_combo(data, level = "small", init = c(.1, .1), xp = .26, yp = .39,
p1min = 0, p1max = .33, p2min = 0, p2max = .41)
contour_combo(data, level = "medium", init = c(.1, .1),
p1min = 0, p1max = .33, p2min = 0, p2max = .41)
contour_combo(data, level = "larger", init = c(.1, .1),
p1min = 0, p1max = .33, p2min = 0, p2max = .41)
Next, we look at the log-likelihood profiles of each parameter.
profile_plot <- function(data, level = c("small", "medium"), p) {
data <- switch(level,
"small" = data,
"medium" = do.call(
rbind, replicate(10, data, simplify = FALSE)))
init <- c(.1, .1)
names(init) <- parnames(multinom_lkl) <- c("p1", "p2")
model <- mle2(multinom_lkl, start = init, vecpar = TRUE,
data = list(xs = data), method = "Nelder-Mead")
if (nrow(data) != 20 && level != "small" && p != 1)
perf <- profile(model, which = p)
else perf <- profile(model, which = p, zmax = 2)
label <- if (p == 1) expression(p[1]) else expression(p[2])
plot(perf, col.minval = "darkgray", col.prof = 1, xlab = label,
main = paste("sample size:", nrow(data)),
cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.25)
}
par(mfrow = c(2, 4), mar = c(4, 4, 3, 2) + .1)
set.seed(1994)
data <- data_generator(n = 10, p = probs)
for(l in c("small", "medium"))
for (i in 1:2)
profile_plot(data, level = l, p = i)
set.seed(1998)
data <- data_generator(20, p = c(.05, .3, .65))
for(l in c("small", "medium"))
for (i in 1:2)
profile_plot(data, level = l, p = i)
We now move to a multinomial logistic regression, with log-likelihood
\[ l(\bm{\beta}) = \sum_{j=1}^{n} \log y_{j}! - \sum_{j=1}^{n} \sum_{i=1}^{k} \log y_{ij}! + \sum_{j=1}^{n} \sum_{i=1}^{k} y_{ij} \log p_{ij}, \quad \text{with} \quad p_{ij} = \frac{ \exp\{\bm{x}_{j}^{\top} \bm{\beta}_{i}\}}{ 1 + \sum_{i=1}^{k-1} \exp\{\bm{x}_{j}^{\top} \bm{\beta}_{i}\}}. \]
where
\(y\) represents the outcome, i.e., an indicator of which category subject \(j\) belongs to;
\(i\) indexes the multinomial categories/levels;
\(p_{ij}\) is the probability that subject \(j\) is classified in category \(i\);
\(\bm{\beta}_{i}\) is the vector of coefficients for category \(i\),
and \(\bm{x}_{j}\) is the row vector of covariates of subject \(j\);
\(y_{ij}\) and \(y_{j}\) are counts, expressing the observed quantities for subject \(j\) in category \(i\).
Here we simulate data in the fashion of multinomial Bernoulli trials, so each subject contributes a single trial.
Thus \(y_{j}\) is always one and each \(y_{ij}\) is either zero or one; consequently, \(\log y_{j}! = \log y_{ij}! = 0\) and the factorial terms vanish.
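In this Bernoulli-type setting the log-likelihood therefore simplifies to
\[ l(\bm{\beta}) = \sum_{j=1}^{n} \sum_{i=1}^{k} y_{ij} \log p_{ij}, \]
which is exactly what the multi_lkl function further below implements.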
# ======================================================================
# inverse multinomial logit function (the heart of everything here) ====
# ======================================================================
inv_logit <- function(coefs, preds, data) {
# building objects -------------------------------------------------
ind_pred <- seq(preds)
n_pred <- length(preds)
k <- n_pred + 1
betas <- split(coefs, substr(names(coefs), start = 3, stop = 3))
list_X <- lapply(preds, model.matrix, data = data)
# keeping here to maybe use in the short future --------------------
## list_model_frame <- lapply(linear_pred, model.frame, data = data)
## list_Y <- lapply(list_model_frame, model.response)
## y_vec <- as.numeric(do.call(c, list_Y))
# computing the important stuff ------------------------------------
exp_xb <- sapply(seq(list_X),
function(i) exp(list_X[[i]] %*% betas[[i]]))
link_denominator <- 1 + rowSums(exp_xb)
ps <- sapply(ind_pred, function(i) exp_xb[ , i]/link_denominator)
ps <- cbind(ps, 1 - rowSums(ps))
colnames(ps) <- paste0("p", seq(k))
# outputs ----------------------------------------------------------
return(list(y = data[ , seq(k)], ps = ps))
}
# defining sample size and number of categories ========================
n <- 100
k <- 3
# ======================================================================
# simulating data ======================================================
# ======================================================================
naive_data <- function(n, k) {
data <- as.data.frame(matrix(0, nrow = n, ncol = k))
names(data) <- paste0("y", 1:k)
return(data)
}
# to do a different set, change the seed ===============================
set.seed(0080)
data <- naive_data(n, k)
# covariates ===========================================================
data$x1 <- rnorm(n, mean = 5, sd = 1)
data$x2 <- rnorm(n, mean = 1, sd = 2)
# linear predictor and initial guesses =================================
linear_pred <- list(y1 ~ x1 + x2, y2 ~ x1 + x2)
initial_values <- c("b01" = .4, "b11" = .1, "b21" = - .3,
"b02" = .2, "b12" = .5, "b22" = - .6)
## initial_values <- list(c("b01" = .4, "b11" = .1, "b21" = - .3),
## c("b02" = .2, "b12" = .5, "b22" = - .6))
# probabilities based on the covariates, used to generate the responses
probs <- inv_logit(initial_values, preds = linear_pred, data = data)$ps
summary(probs)
## p1 p2 p3
## Min. :0.02625 Min. :0.1761 Min. :0.00374
## 1st Qu.:0.09887 1st Qu.:0.6028 1st Qu.:0.03496
## Median :0.16629 Median :0.7454 Median :0.09148
## Mean :0.16406 Mean :0.7111 Mean :0.12488
## 3rd Qu.:0.21648 3rd Qu.:0.8647 3rd Qu.:0.17850
## Max. :0.32442 Max. :0.9700 Max. :0.59687
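A small sanity check (added here, not in the original): by construction, the probabilities of each subject should sum to one.
range(rowSums(probs)) # should be exactly 1 for every row (up to floating point)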
# loading library ======================================================
library(mc2d) # vectorized versions of {r, d}multinom
# finally, simulating the multinomial data =============================
data[ , seq(k)] <- rmultinomial(n, 1, prob = probs)
colSums(data[ , seq(k)]) # as expected, close to n times the column means of ``probs``
## y1 y2 y3
## 14 70 16
# ======================================================================
# multinomial likelihood ===============================================
# ======================================================================
multi_lkl <- function(initial_values, linear_pred, data) {
ilogit <- inv_logit(initial_values, preds = linear_pred, data = data)
lkl <- with(ilogit, sum(y * log(ps)))
## lkl <- with(ilogit,
## sum(dmultinomial(y, size = 1, prob = ps, log = TRUE)))
return(-lkl)
}
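A hedged spot check of this implementation: evaluated at the values used to simulate, it should agree with the vectorized dmultinomial() route kept commented in the function body.
multi_lkl(initial_values, linear_pred, data)
-sum(dmultinomial(as.matrix(data[ , seq(k)]), size = 1,
                  prob = probs, log = TRUE))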
# loading library ======================================================
library(bbmle)
parnames(multi_lkl) <- names(initial_values) # required by mle2
# model fitting ========================================================
(fit <- mle2(multi_lkl, start = initial_values,
data = list(linear_pred = linear_pred, data = data)))
##
## Call:
## mle2(minuslogl = multi_lkl, start = initial_values, data = list(linear_pred = linear_pred,
## data = data))
##
## Coefficients:
## b01 b11 b21 b02 b12 b22
## 1.71609801 -0.26076624 -0.42126060 0.05884933 0.43274118 -0.49420132
##
## Log-likelihood: -72.97
# behold the estimated probabilities, side by side with the true ones =
results <- lapply(
seq(k),
function(i) cbind(inv_logit(fit@coef, linear_pred, data)$ps[ , i],
probs[ , i]))
results <- do.call(cbind, results)
colnames(results) <- c(sapply(seq(k),
function(i) c(paste0("p", i, "_est"),
paste0("p", i, "_true"))))
head(results, 10)
## p1_est p1_true p2_est p2_true p3_est p3_true
## [1,] 0.07362905 0.21727022 0.2554734 0.1858614 0.67089756 0.59686839
## [2,] 0.08184015 0.18000619 0.7122206 0.6689320 0.20593929 0.15106185
## [3,] 0.11227949 0.16062883 0.7540724 0.7474396 0.13364814 0.09193157
## [4,] 0.09294784 0.07730113 0.8717613 0.9040967 0.03529081 0.01860213
## [5,] 0.11314446 0.18527656 0.7044381 0.6819218 0.18241740 0.13280162
## [6,] 0.15454264 0.21134961 0.6415401 0.6329376 0.20391725 0.15571276
## [7,] 0.12081038 0.08890887 0.8393728 0.8891595 0.03981685 0.02193163
## [8,] 0.12135320 0.24683283 0.4764948 0.4165770 0.40215199 0.33659012
## [9,] 0.16782814 0.15426992 0.7362683 0.7808901 0.09590359 0.06483995
## [10,] 0.12213469 0.11892142 0.8087069 0.8387032 0.06915837 0.04237537
# ======================================================================
# checking the results with the existing r libraries ===================
# ======================================================================
# for these libraries it is better to put the data in the long format =
library(reshape2)
data_long <- melt(data, measure.vars = seq(k), variable.name = "y")
data_long <- data_long[data_long$value == 1, ]
library(nnet) # --------------------------------------------------------
data_long$y <- relevel(data_long$y, ref = k) # changing the ref. level
# behold, the estimated coefficients are the same ----------------------
(fit_nnet <- multinom(y ~ x1 + x2, data_long))
## # weights: 12 (6 variable)
## initial value 109.861229
## iter 10 value 72.972199
## final value 72.971664
## converged
## Call:
## multinom(formula = y ~ x1 + x2, data = data_long)
##
## Coefficients:
## (Intercept) x1 x2
## y1 1.71593734 -0.2607501 -0.4212285
## y2 0.05886355 0.4327226 -0.4941669
##
## Residual Deviance: 145.9433
## AIC: 157.9433
head(fitted(fit_nnet)) # estimated probabilities, behold
## y3 y1 y2
## 4 0.03529560 0.09294889 0.8717555
## 5 0.18241968 0.11314438 0.7044359
## 9 0.09591009 0.16782449 0.7362654
## 12 0.13707584 0.24115786 0.6217663
## 16 0.08257656 0.09022618 0.8271973
## 24 0.37499417 0.13547364 0.4895322
library(mlogit) # ------------------------------------------------------
fit_mlogit <- mlogit(y ~ 0 | x1 + x2, reflevel = 3,
shape = "wide", data_long)
# behold, the estimated coefficients are the same ----------------------
matrix(fit_mlogit$coef, nrow = 2, ncol = 3)
## [,1] [,2] [,3]
## [1,] 1.71603345 -0.2607527 -0.4212572
## [2,] 0.05885441 0.4327380 -0.4941973
When we look at the estimated probabilities returned by the nnet and mlogit packages, they seem a little strange at first because of their confusing order of presentation. However, matching by the subject id we can see that the probabilities are equal to the ones found by our implementation.
Another interesting thing (which I noticed only after doing all of this) is that the mlogit package uses the nnet package in its implementation.
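To make that agreement concrete (a small illustrative check, not in the original), we can predict with the nnet fit on the original wide data, so rows line up with subjects, and place the probabilities side by side with ours:
ours <- inv_logit(fit@coef, linear_pred, data)$ps
nnet_probs <- predict(fit_nnet, newdata = data, type = "probs")
head(cbind(ours, nnet_probs[ , c("y1", "y2", "y3")])) # p1~y1, p2~y2, p3~y3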