\(\newcommand{\bm}[1]{\boldsymbol{\mathbf{#1}}}\)

The probability mass function (PMF) and the likelihood function (\(L\)) of a multinomial distribution are

\[ L(\bm{p}) = f(\bm{x}; \bm{p}) = \begin{cases} \frac{n!}{\prod_{i=1}^{k} x_{i}!} \prod_{i=1}^{k} p_{i}^{x_{i}}, & \text{when } \sum_{i=1}^{k} x_{i} = n\\ 0, & \text{otherwise}, \end{cases} \] for non-negative integers \(x_{1}, \dots, x_{k}\).

We work with the corresponding log-likelihood,

\[ l(\bm{p}) = \log L(\bm{p}) = \log n! - \sum_{i=1}^{k} \log x_{i}! + \sum_{i=1}^{k} x_{i} \log p_{i}. \]
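
As a quick sanity check (with hypothetical counts and probabilities), this expression agrees with R's dmultinom() evaluated on the log scale:

x <- c(3, 2, 5) ; p <- c(.3, .2, .5) # hypothetical counts and probabilities
dmultinom(x, size = sum(x), prob = p, log = TRUE)
# the same value, written as in the expression above (lgamma(n + 1) = log n!)
lgamma(sum(x) + 1) - sum(lgamma(x + 1)) + sum(x * log(p))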

Simulating data

We consider \(k = 3\) categories and simulate \(n = 100\) random vectors.

probs <- c(.3, .2, .5) # probabilities for each category k
data_generator <- function(n, k = 3, p) {
    data <- t(rmultinom(n, 1, prob = p))
    return(data)
}
set.seed(1993)
data <- data_generator(n = 100, p = probs)
t(data)[ , 1:15] # each column is a random vector
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15]
## [1,]    0    0    0    0    1    0    1    0    0     0     0     0     0     0     0
## [2,]    0    0    0    0    0    0    0    1    0     0     1     0     0     1     0
## [3,]    1    1    1    1    0    1    0    0    1     1     0     1     1     0     1
colSums(data) # counts close to n times the fixed probabilities, as expected
## [1] 27 22 51

MLE (without covariates)

Writing the log-likelihood function

multinom_lkl <- function(par_init, xs) {
    k <- ncol(xs)
    # k - 1 free parameters, since the last probability is the complement
    p <- par_init[1:(k - 1)]
    # store each evaluation in a global vector, to trace the optimizers' paths
    ps <<- c(ps, p)
    lkl_p <- sum(xs[ , -k] %*% log(p)) + sum(xs[ , k] * log(1 - sum(p)))
    # normalizing constant: log n! - sum_i log x_i!
    n <- nrow(xs)
    nx <- colSums(xs)
    logs <- log(seq_len(n))
    nx_log <- sapply(nx, function(x) sum(logs[seq_len(x)]))
    lkl <- sum(logs) - sum(nx_log) + lkl_p
    return(-lkl)
}
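
A quick usage check (the function appends each evaluation to a global vector ps, used below to trace the optimizers' paths, so ps must exist beforehand): the negative log-likelihood should be smaller at the empirical proportions, the MLEs, than at the true probabilities.

ps <- c()
multinom_lkl(c(.3, .2), xs = data)           # at the true probabilities
multinom_lkl(colMeans(data)[1:2], xs = data) # at the empirical proportions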

Maximization, and deviance contour

\[ \text{deviance} \equiv D(\bm{p}) = -2~\{l(\bm{p}) - l(\hat{\bm{p}})\} \]
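
In code, since multinom_lkl() returns \(-l\), the deviance at a point \(\bm{p}\) is simply (a minimal sketch of what the contour function below computes internally):

# deviance at p given the MLEs p_hat; multinom_lkl() returns -l
deviance_at <- function(p, p_hat, xs)
    2 * (multinom_lkl(p, xs) - multinom_lkl(p_hat, xs))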

Doing the maximization by different optimization routines:

optim method: Nelder-Mead

library(bbmle)

initial_values <- c(.1, .1)
names(initial_values) <- parnames(multinom_lkl) <- c("p1", "p2")
ps <- c()
est_nm <- mle2(multinom_lkl, start = initial_values, vecpar = TRUE,
               data = list(xs = data), method = "Nelder-Mead")
trace_nm <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
# estimated probabilities
c(est_nm@coef, p3 = 1 - sum(est_nm@coef))
##        p1        p2        p3 
## 0.2699814 0.2200481 0.5099704

These match the analytical MLEs, \(x_{i}~/~n\). Behold:

colSums(data)/nrow(data)
## [1] 0.27 0.22 0.51
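
This closed form follows from maximizing \(l(\bm{p})\) subject to \(\sum_{i=1}^{k} p_{i} = 1\), e.g., with a Lagrange multiplier \(\lambda\):

\[ \frac{\partial}{\partial p_{i}} \left\{ \sum_{i=1}^{k} x_{i} \log p_{i} - \lambda \left( \sum_{i=1}^{k} p_{i} - 1 \right) \right\} = \frac{x_{i}}{p_{i}} - \lambda = 0 \quad\Rightarrow\quad p_{i} = \frac{x_{i}}{\lambda}, \] and summing the constraint gives \(\lambda = \sum_{i=1}^{k} x_{i} = n\), hence \(\hat{p}_{i} = x_{i}~/~n\).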

optim method: BFGS

initial_values <- c(.1, .1)
names(initial_values) <- parnames(multinom_lkl) <- c("p1", "p2")
ps <- c()
est_bfgs <- mle2(multinom_lkl, start = initial_values, vecpar = TRUE,
                 data = list(xs = data), method = "BFGS")
trace_bfgs <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
# estimated probabilities
c(est_bfgs@coef, p3 = 1 - sum(est_bfgs@coef))
##        p1        p2        p3 
## 0.2699995 0.2200006 0.5099999

Deviance contours (+ more methods)

contour_dev <- function(est, FUN, df,
                        p1min = .15, p1max = .42, p2min = .10, p2max = .37,
                        desc = NULL, desc_cex = 1.5,
                        lab_cex = 1.5, axis_cex = 1.25,
                        contour_add = FALSE, contour_col = 1, ...) {
    multinom_dev <- function(p_grid, par_est, ...) {
        return(2 *(FUN(p_grid, ...) - FUN(par_est, ...)))
    }
    dev_contour <- Vectorize(function(a, b, ...)
        multinom_dev(c(a, b), ...))
    p1_grid <- seq(p1min, p1max, length.out = 30)
    p2_grid <- seq(p2min, p2max, length.out = 30)
    outer_grid <- outer(p1_grid, p2_grid, dev_contour,
                        xs = df, par_est = est, ...)
    contour_levels <- c(.99, .95, .9, .8, .7, .5, .3, .1, .05, .01)
    contour(p1_grid, p2_grid, outer_grid,
            levels = qchisq(contour_levels, df = 2),
            labels = contour_levels,
            xlab = expression(p[1]), ylab = expression(p[2]),
            main = desc, cex.main = desc_cex,
            cex.lab = lab_cex, cex.axis = axis_cex,
            add = contour_add, col = contour_col)
    abline(v = est[1], h = est[2], lty = 2, col = "darkgray")
}
contour_dev_plus <- function(est, method, trace) {
    contour_dev(est, FUN = multinom_lkl, df = data,
                desc = paste(
                    "Deviance contour\n(MLEs via", method, "method)"))
    for (i in 1:nrow(trace)) text(trace[i, 1], trace[i, 2],
                                  labels = i, col = "#0080ff", cex = 1.25)
}
par(mfrow = c(2, 3))
contour_dev_plus(est_nm@coef, method = "Nelder-Mead", trace_nm)
contour_dev_plus(est_bfgs@coef, method = "BFGS", trace_bfgs)
# trying different optimization routines from the optimx package =======
library(optimx)
ps <- c()
est_cg <- optimx(c(.1, .1), multinom_lkl, xs = data, method = "CG")
trace_cg <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
contour_dev_plus(coef(est_cg), method = "CG", trace_cg)
ps <- c()
est_nlm <- optimx(c(.1, .1), multinom_lkl, xs = data, method = "nlm")
trace_nlm <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
contour_dev_plus(coef(est_nlm), method = "NLM", trace_nlm)
library(Rcgmin)
ps <- c()
est_rcg <- optimx(c(.1, .1), multinom_lkl, xs = data, method = "Rcgmin")
trace_rcg <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
contour_dev_plus(coef(est_rcg), method = "Rcgmin", trace_rcg)
library(minqa)
ps <- c()
est_boby <- optimx(c(.1, .1), multinom_lkl, xs = data, method = "bobyqa")
trace_boby <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
contour_dev_plus(coef(est_boby), method = "bobyqa", trace_boby)

In blue, the coefficient estimates at each function evaluation.

Deviance contours (with my gradient)

gradient <- function(par_init, xs) {
    # analytical gradient of the negative log-likelihood (k = 3 case)
    x <- colSums(xs)
    common_term <- x[3]/(1 - sum(par_init))
    gg <- x[1:2]/par_init - common_term
    return(-gg)
}
parnames(gradient) <- c("p1", "p2")
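# optional check (an added sketch; assumes the numDeriv package is
# installed): the analytical gradient should agree with a numerical
# gradient of multinom_lkl at an arbitrary point
library(numDeriv)
ps <- c()
grad(multinom_lkl, c(.25, .25), xs = data) # numerical gradient of -l
gradient(c(.25, .25), xs = data)           # analytical gradient of -l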
necessary_fit <- function(opt, meth) {
    initial_values <- c(.1, .1)
    names(initial_values) <- parnames(multinom_lkl) <- c("p1", "p2")
    est <- mle2(multinom_lkl, start = initial_values, vecpar = TRUE,
                data = list(xs = data),
                optimizer = opt, method = meth, gr = gradient)
    trace_meth <- matrix(ps, ncol = ncol(data) - 1, byrow = TRUE)
    return(list(coef = est@coef, trace = trace_meth))
}
par(mfrow = c(2, 3))
# Nelder-Mead ==========================================================
ps <- c()
est_nm <- necessary_fit(opt = "optim", meth = "Nelder-Mead")
contour_dev_plus(est_nm$coef, method = "Nelder-Mead", est_nm$trace)
# BFGS =================================================================
ps <- c()
est_bfgs <- necessary_fit(opt = "optim", meth = "BFGS")
contour_dev_plus(est_bfgs$coef, method = "BFGS", est_bfgs$trace)
# CG ===================================================================
ps <- c()
est_cg <- necessary_fit(opt = "optim", meth = "CG")
contour_dev_plus(est_cg$coef, method = "CG", est_cg$trace)
# NLMINB ===============================================================
ps <- c()
est_nlminb <- necessary_fit(opt = "optimx", meth = "nlminb")
contour_dev_plus(est_nlminb$coef, method = "nlminb", est_nlminb$trace)
# Rcgmin ===============================================================
ps <- c()
est_rcg <- necessary_fit(opt = "optimx", meth = "Rcgmin")
contour_dev_plus(est_rcg$coef, method = "Rcgmin", est_rcg$trace)
# BOBYQA ===============================================================
ps <- c()
est_boby <- necessary_fit(opt = "optimx", meth = "bobyqa")
contour_dev_plus(est_boby$coef, method = "bobyqa", est_boby$trace)

Quadratic approximation

of the log-likelihood

\[ \text{quadratic approx.:} \quad l(\bm{p}) \approx l(\hat{\bm{p}}) - \frac{1}{2}~ (\bm{p}-\hat{\bm{p}})^{\top}~I_{E}(\hat{\bm{p}})~(\bm{p}-\hat{\bm{p}}), \] with

\[ \begin{align} H_{\text{essian}}(\bm{p}) &= \begin{bmatrix} \partial^{2} l(\bm{p})~/~\partial p_{1}^{2} & \partial^{2} l(\bm{p})~/~\partial p_{1} p_{2} \\ \partial^{2} l(\bm{p})~/~\partial p_{2} p_{1} & \partial^{2} l(\bm{p})~/~\partial p_{2}^{2} \end{bmatrix}\\ &= \begin{bmatrix} -x_{1}~/~p_{1}^{2} - x_{3}~/~(1 - p_{1} - p_{2})^{2} & -x_{3}~/~(1 - p_{1} - p_{2})^{2}\\ -x_{3}~/~(1 - p_{1} - p_{2})^{2} & -x_{2}~/~p_{2}^{2} - x_{3}~/~(1 - p_{1} - p_{2})^{2} \end{bmatrix}\\ I_{O}(\bm{p}) = -H(\bm{p}) &= \begin{bmatrix} x_{1}~/~p_{1}^{2} + x_{3}~/~(1 - p_{1} - p_{2})^{2} & x_{3}~/~(1 - p_{1} - p_{2})^{2}\\ x_{3}~/~(1 - p_{1} - p_{2})^{2} & x_{2}~/~p_{2}^{2} + x_{3}~/~(1 - p_{1} - p_{2})^{2} \end{bmatrix}\\ I_{E}(\bm{p}) = \mathbb{E}\{I_{O}(\bm{p})\} &= \begin{bmatrix} n~/~p_{1} + n~/~p_{3} & n~/~p_{3}\\ n~/~p_{3} & n~/~p_{2} + n~/~p_{3} \end{bmatrix}, \end{align} \] since \(\mathbb{E}\{x_{i}\} = n~p_{i}\).

Thus, the asymptotic variance-covariance matrix is

\[ \begin{align} I_{E}(\bm{p})^{-1} &= \begin{bmatrix} p_{1}~(p_{2} + p_{3})~/~n & -~p_{1}~p_{2}~/~n\\ -~p_{1}~p_{2}~/~n & p_{2}~(p_{1} + p_{3})~/~n \end{bmatrix}, \end{align} \] i.e., the familiar \(\text{Var}(\hat{p}_{i}) = p_{i}~(1 - p_{i})~/~n\) and \(\text{Cov}(\hat{p}_{1}, \hat{p}_{2}) = -~p_{1}~p_{2}~/~n\).
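
A quick numerical check (with hypothetical values \(\bm{p} = (.3, .2, .5)\) and \(n = 100\)), inverting \(I_{E}\) in R:

p <- c(.3, .2, .5) ; n <- 100 # hypothetical values
Ie <- n * matrix(c(1/p[1] + 1/p[3], 1/p[3],
                   1/p[3], 1/p[2] + 1/p[3]), 2)
solve(Ie)                                       # I_E^{-1}
matrix(c(p[1] * (1 - p[1]), -p[1] * p[2],
         -p[1] * p[2], p[2] * (1 - p[2])), 2)/n # p_i (1 - p_i)/n, -p_1 p_2/n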

And the correlation between \(p_{1}\) and \(p_{2}\), \(\rho\), is given by

\[ \begin{align} \rho_{~p_{1}~p_{2}} &= \frac{-~p_{1}~p_{2}~/~n}{\sqrt{p_{1}~(p_{2} + p_{3})~/~n}~ \sqrt{p_{2}~(p_{1} + p_{3})~/~n}} \\ &= -~\frac{p_{1}~p_{2}}{\sqrt{p_{1}~p_{2}}~ \sqrt{p_{2} + p_{3}}~\sqrt{p_{1} + p_{3}}} \\ &= -~\frac{\sqrt{p_{1}~p_{2}}}{ \sqrt{p_{1} + p_{3}}~\sqrt{p_{2} + p_{3}}}. \end{align} \]

multinom_quad <- function(p, xs, model) {
    n <- nrow(xs)
    # expected information matrix evaluated at the MLEs
    off_diag <- n/(1 - sum(model@coef))
    Ie <- matrix(c(n/model@coef[1] + off_diag, rep(off_diag, 2),
                   n/model@coef[2] + off_diag), 2, byrow = TRUE)
    # quadratic approximation of the log-likelihood around the MLEs
    quad_approx <- -multinom_lkl(par_init = model@coef, xs = xs) -
        .5 * (p - model@coef) %*% Ie %*% (p - model@coef)
    # returning the negative, since we do the same with the lkl
    return(-quad_approx)
}
rho <- function(par_est, xp, yp) {
    pars <- par_est@coef
    pars <- c(pars, 1 - sum(pars))
    p1 <- pars[1] ; p2 <- pars[2] ; p3 <- pars[3]
    # asymptotic correlation between the two estimated probabilities
    rho <- -sqrt(p1 * p2)/(sqrt(p1 + p3) * sqrt(p2 + p3))
    points(p1, p2, pch = 19, cex = .75)
    text(xp, yp, cex = 1.5,
         labels = substitute(paste(rho[ ~ p[1] ~ p[2]], " = ", r),
                             list(r = round(rho, 3))))
}
contour_combo <- function(data, level = c("small", "medium", "larger"),
                          init, p1min, p1max, p2min, p2max,
                          xp = NA, yp = NA, ...) {
    data <- switch(level,
                   "small" = data,
                   "medium" = do.call(
                       rbind, replicate(10, data, simplify = FALSE)),
                   "larger" = do.call(
                       rbind, replicate(100, data, simplify = FALSE)))
    names(init) <- parnames(multinom_lkl) <- c("p1", "p2")
    est_nm <- mle2(multinom_lkl, start = init, data = list(xs = data),
                   vecpar = TRUE, method = "Nelder-Mead")
    contour_dev(est = est_nm@coef, FUN = multinom_lkl, df = data,
                p1min, p1max, p2min, p2max,
                desc = paste("sample size:", nrow(data)),
                desc_cex = 1.5, lab_cex = 1.5, axis_cex = 1.25)
    contour_dev(est = est_nm@coef, FUN = multinom_quad, df = data,
                model = est_nm, p1min, p1max, p2min, p2max,
                contour_add = TRUE, contour_col = 2)
    rho(est_nm, xp, yp)
}
par(mfrow = c(2, 3))
set.seed(1994)
data <- data_generator(10, p = probs)
contour_combo(data, level = "small", init = c(.1, .1), xp = .6, yp = .8,
              p1min = .03, p1max = .76, p2min = .06, p2max = .83)
contour_combo(data, level = "medium", init = c(.1, .1),
              p1min = .03, p1max = .76, p2min = .06, p2max = .83)
contour_combo(data, level = "larger", init = c(.1, .1),
              p1min = .03, p1max = .76, p2min = .06, p2max = .83)
set.seed(1998)
data <- data_generator(20, p = c(.05, .3, .65))
contour_combo(data, level = "small", init = c(.1, .1), xp = .26, yp = .39,
              p1min = 0, p1max = .33, p2min = 0, p2max = .41)
contour_combo(data, level = "medium", init = c(.1, .1),
              p1min = 0, p1max = .33, p2min = 0, p2max = .41)
contour_combo(data, level = "larger", init = c(.1, .1),
              p1min = 0, p1max = .33, p2min = 0, p2max = .41)

Likelihood profiles

log-likelihood profiles

profile_plot <- function(data, level = c("small", "medium"), p) {
    data <- switch(level,
                   "small" = data,
                   "medium" = do.call(
                       rbind, replicate(10, data, simplify = FALSE)))
    init <- c(.1, .1)
    names(init) <- parnames(multinom_lkl) <- c("p1", "p2")
    model <- mle2(multinom_lkl, start = init, vecpar = TRUE,
                  data = list(xs = data), method = "Nelder-Mead")
    if (nrow(data) != 20 & level != "small" & p != 1) {
        perf <- profile(model, which = p)
    } else {
        perf <- profile(model, which = p, zmax = 2)
    }
    label <- if (p == 1) expression(p[1]) else expression(p[2])
    plot(perf, col.minval = "darkgray", col.prof = 1, xlab = label,
         main = paste("sample size:", nrow(data)),
         cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.25)
}
par(mfrow = c(2, 4), mar = c(4, 4, 3, 2) + .1)
set.seed(1994)
data <- data_generator(n = 10, p = probs)
for(l in c("small", "medium"))
    for (i in 1:2)
        profile_plot(data, level = l, p = i)
set.seed(1998)
data <- data_generator(20, p = c(.05, .3, .65))
for(l in c("small", "medium"))
    for (i in 1:2)
        profile_plot(data, level = l, p = i)

And now with covariates

log-likelihood,

\[ l(\bm{\beta}) = \sum_{j=1}^{n} \log y_{j}! - \sum_{j=1}^{n} \sum_{i=1}^{k} \log y_{ij}! + \sum_{j=1}^{n} \sum_{i=1}^{k} y_{ij} \log p_{ij}, \quad \text{with} \quad p_{ij} = \frac{ \exp\{\bm{x}_{j}^{\top} \bm{\beta}_{i}\}}{ 1 + \sum_{l=1}^{k-1} \exp\{\bm{x}_{j}^{\top} \bm{\beta}_{l}\}}, \quad i = 1, \dots, k - 1, \] and \(p_{kj} = 1 - \sum_{i=1}^{k-1} p_{ij}\) for the reference category \(k\).

with

  • \(y\) represents the outcome, i.e., an indicator of the category to which subject \(j\) belongs.

  • \(i\) indexes the multinomial category/level.

  • \(p_{ij}\) is the probability that subject \(j\) is classified in category \(i\).

  • \(\bm{\beta}_{i}\) is the vector of coefficients for category \(i\).

  • and \(\bm{x}_{j}\) is the row vector of covariates of subject \(j\).

\(y_{ij}\) and \(y_{j}\) are counts: \(y_{ij}\) is the observed count for subject \(j\) in category \(i\), and \(y_{j} = \sum_{i=1}^{k} y_{ij}\).

Here we simulate data in the fashion of multinomial Bernoulli trials, say: each subject falls into exactly one category.

Thus \(y_{j} = 1\) for every subject and each \(y_{ij}\) is either 0 or 1. As a consequence, all the factorial terms vanish, since \(\log 0! = \log 1! = 0\).
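
As a tiny numeric sketch of the inverse multinomial logit above (hypothetical values: one subject with covariate row \(\bm{x}_{j} = (1, 5, 1)\) and the two coefficient vectors used as initial guesses below):

x_j <- c(1, 5, 1)        # intercept, x1, x2 (hypothetical covariate row)
beta_1 <- c(.4, .1, -.3) # category 1 coefficients (hypothetical)
beta_2 <- c(.2, .5, -.6) # category 2 coefficients (hypothetical)
eta <- c(sum(x_j * beta_1), sum(x_j * beta_2))
p_j <- exp(eta)/(1 + sum(exp(eta)))
c(p_j, 1 - sum(p_j))     # p_1j, p_2j, and the reference p_3j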

# ======================================================================
# inverse multinomial logit function (the heart of everything here) ====
# ======================================================================
inv_logit <- function(coefs, preds, data) {
    # building objects -------------------------------------------------
    ind_pred <- seq(preds)
    n_pred <- length(preds)
    k <- n_pred + 1
    betas <- split(coefs, substr(names(coefs), start = 3, stop = 3))
    list_X <- lapply(preds, model.matrix, data = data)
    # keeping here to maybe use in the short future --------------------
    ## list_model_frame <- lapply(linear_pred, model.frame, data = data)
    ## list_Y <- lapply(list_model_frame, model.response)
    ## y_vec <- as.numeric(do.call(c, list_Y))
    # computing the important stuff ------------------------------------
    exp_xb <- sapply(seq(list_X),
                     function(i) exp(list_X[[i]] %*% betas[[i]]))
    link_denominator <- 1 + rowSums(exp_xb)
    ps <- sapply(ind_pred, function(i) exp_xb[ , i]/link_denominator)
    ps <- cbind(ps, 1 - rowSums(ps))
    colnames(ps) <- paste0("p", seq(k))
    # outputs ----------------------------------------------------------
    return(list(y = data[ , seq(k)], ps = ps))
}
# defining sample size and number of categories ========================
n <- 100
k <- 3
# ======================================================================
# simulating data ======================================================
# ======================================================================
naive_data <- function(n, k) {
    data <- as.data.frame(matrix(0, nrow = n, ncol = k))
    names(data) <- paste0("y", 1:k)
    return(data)
}
# to do a different set, change the seed ===============================
set.seed(0080)
data <- naive_data(n, k)
# covariates ===========================================================
data$x1 <- rnorm(n, mean =  5, sd =  1)
data$x2 <- rnorm(n, mean =  1, sd =  2)
# linear predictor and initial guesses =================================
linear_pred <- list(y1 ~ x1 + x2, y2 ~ x1 + x2)
initial_values <- c("b01" = .4, "b11" = .1, "b21" = - .3,
                    "b02" = .2, "b12" = .5, "b22" = - .6)
## initial_values <- list(c("b01" = .4, "b11" = .1, "b21" = - .3),
##                        c("b02" = .2, "b12" = .5, "b22" = - .6))
# probs based in the covs to be used to generate the responses =========
probs <- inv_logit(initial_values, preds = linear_pred, data = data)$ps
summary(probs)
##        p1                p2               p3         
##  Min.   :0.02625   Min.   :0.1761   Min.   :0.00374  
##  1st Qu.:0.09887   1st Qu.:0.6028   1st Qu.:0.03496  
##  Median :0.16629   Median :0.7454   Median :0.09148  
##  Mean   :0.16406   Mean   :0.7111   Mean   :0.12488  
##  3rd Qu.:0.21648   3rd Qu.:0.8647   3rd Qu.:0.17850  
##  Max.   :0.32442   Max.   :0.9700   Max.   :0.59687
# loading library ======================================================
library(mc2d) # vectorized versions of {r, d}multinom
# finally, simulating the multinomial data =============================
data[ , seq(k)] <- rmultinomial(n, 1, prob = probs)
colSums(data[ , seq(k)]) # as expected, close to the ``probs`` means
## y1 y2 y3 
## 14 70 16
# ======================================================================
# multinomial likelihood ===============================================
# ======================================================================
multi_lkl <- function(initial_values, linear_pred, data) {
    ilogit <- inv_logit(initial_values, preds = linear_pred, data = data)
    lkl <- with(ilogit, sum(y * log(ps)))
    ## lkl <- with(ilogit,
    ##             sum(dmultinomial(y, size = 1, prob = ps, log = TRUE)))
    return(-lkl)
}
# loading library ======================================================
library(bbmle)
parnames(multi_lkl) <- names(initial_values) # mle2 requirement
# model fitting ========================================================
(fit <- mle2(multi_lkl, start = initial_values,
             data = list(linear_pred = linear_pred, data = data)))
## 
## Call:
## mle2(minuslogl = multi_lkl, start = initial_values, data = list(linear_pred = linear_pred, 
##     data = data))
## 
## Coefficients:
##         b01         b11         b21         b02         b12         b22 
##  1.71609801 -0.26076624 -0.42126060  0.05884933  0.43274118 -0.49420132 
## 
## Log-likelihood: -72.97
# behold the estimated probabilities ===================================
results <- lapply(
    seq(k),
    function(i) cbind(inv_logit(fit@coef, linear_pred, data)$ps[ , i],
                      probs[ , i]))
results <- do.call(cbind, results)
colnames(results) <- c(sapply(seq(k),
                              function(i) c(paste0("p", i, "_est"),
                                            paste0("p", i, "_true"))))
head(results, 10)
##           p1_est    p1_true    p2_est   p2_true     p3_est    p3_true
##  [1,] 0.07362905 0.21727022 0.2554734 0.1858614 0.67089756 0.59686839
##  [2,] 0.08184015 0.18000619 0.7122206 0.6689320 0.20593929 0.15106185
##  [3,] 0.11227949 0.16062883 0.7540724 0.7474396 0.13364814 0.09193157
##  [4,] 0.09294784 0.07730113 0.8717613 0.9040967 0.03529081 0.01860213
##  [5,] 0.11314446 0.18527656 0.7044381 0.6819218 0.18241740 0.13280162
##  [6,] 0.15454264 0.21134961 0.6415401 0.6329376 0.20391725 0.15571276
##  [7,] 0.12081038 0.08890887 0.8393728 0.8891595 0.03981685 0.02193163
##  [8,] 0.12135320 0.24683283 0.4764948 0.4165770 0.40215199 0.33659012
##  [9,] 0.16782814 0.15426992 0.7362683 0.7808901 0.09590359 0.06483995
## [10,] 0.12213469 0.11892142 0.8087069 0.8387032 0.06915837 0.04237537
# ======================================================================
# checking the results with the existing r libraries ===================
# ======================================================================
# for these libs it is better to put the data in the long format =======
library(reshape2)
data_long <- melt(data, measure.vars = seq(k), variable.name = "y")
data_long <- data_long[data_long$value == 1, ]
library(nnet) # --------------------------------------------------------
data_long$y <- relevel(data_long$y, ref = k) # changing the ref. level
# behold, the estimated coefficients are the same ----------------------
(fit_nnet <- multinom(y ~ x1 + x2, data_long))
## # weights:  12 (6 variable)
## initial  value 109.861229 
## iter  10 value 72.972199
## final  value 72.971664 
## converged
## Call:
## multinom(formula = y ~ x1 + x2, data = data_long)
## 
## Coefficients:
##    (Intercept)         x1         x2
## y1  1.71593734 -0.2607501 -0.4212285
## y2  0.05886355  0.4327226 -0.4941669
## 
## Residual Deviance: 145.9433 
## AIC: 157.9433
head(fitted(fit_nnet)) # estimated probabilities, behold
##            y3         y1        y2
## 4  0.03529560 0.09294889 0.8717555
## 5  0.18241968 0.11314438 0.7044359
## 9  0.09591009 0.16782449 0.7362654
## 12 0.13707584 0.24115786 0.6217663
## 16 0.08257656 0.09022618 0.8271973
## 24 0.37499417 0.13547364 0.4895322
library(mlogit) # ------------------------------------------------------
fit_mlogit <- mlogit(y ~ 0 | x1 + x2, reflevel = 3,
                     shape = "wide", data_long)
# behold, the estimated coefficients are the same ----------------------
matrix(fit_mlogit$coef, nrow = 2, ncol = 3)
##            [,1]       [,2]       [,3]
## [1,] 1.71603345 -0.2607527 -0.4212572
## [2,] 0.05885441  0.4327380 -0.4941973

When we look at the estimated probabilities from the nnet and mlogit packages they seem a little strange at first, due to their confusing order of presentation. However, matching by the subject id number we can see that the probabilities are equal to the ones found by our implementation.

Another interesting thing (which I only noticed afterwards) is that the mlogit package relies on the nnet package for its implementation, say.