\(\newcommand{\bm}[1]{\boldsymbol{\mathbf{#1}}}\)

Data

Carcinogenesis: Days to Vaginal Cancer Mortality in Rats

## packages used throughout the post
library(DT) ; library(dplyr) ; library(ggplot2) ; library(bbmle)
library(survival) ; library(survminer) ; library(patchwork)

dat <- data.frame(id = factor(1:40),
                  time = c(143, 164, 188, 188, 190, 192, 206, 209, 213,
                           216, 220, 227, 230, 234, 246, 265, 304, 216,
                           244, 142, 156, 163, 198, 205, 232, 232, 233,
                           233, 233, 233, 239, 240, 261, 280, 280, 296,
                           296, 323, 204, 344),
                  status = c(rep("1", 17), rep("0", 2),
                             rep("1", 19), rep("0", 2)),
                  group = c(rep("1", 19), rep("2", 21)))
datatable(dat, options = list(pageLength = 5),
          class = 'cell-border stripe', rownames = FALSE)
ggplot(dat, aes(x = time, y = id, label = time)) +
    geom_segment(aes(x = 0, xend = time, y = id, yend = id), col = "gray") +
    geom_point(aes(col = status), size = 7) +
    geom_text(color = "white", size = 3) +
    scale_color_manual(labels = c("censored", "failed"),
                       values = c("1" = 1, "0" = 2)) +
    facet_wrap(~ group, scales = "free") +
    labs(title = "survival times by treatment group",
         subtitle = "pay attention, we have ties in both groups!") +
    theme_classic(base_size = 14)

tapply(dat$time, dat$group, summary)
$`1`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  143.0   191.0   216.0   215.5   232.0   304.0 

$`2`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  142.0   205.0   233.0   239.2   280.0   344.0 

Relative Risk Model

Partial Likelihood for \(\beta\)

\[ \text{relative risk model}: \quad \lambda(t \mid Z) = \lambda_{0}(t) \exp\{Z(t)^{\top}\beta\}, \] under arbitrary independent right censoring and, for now, without ties.

The partial likelihood for \(\beta\), under independent censoring, is given by \[ L(\beta) = \prod_{j=1}^{k} \frac{\exp\{Z_{j}(t_{j})^{\top}\beta\}}{\sum_{l \in R(t_{j})} \exp\{Z_{l}(t_{j})^{\top}\beta\}}. \] In the carcinogenesis dataset there are ties among the uncensored failure times, so the partial likelihood needs to be adjusted. The average partial likelihood arises from breaking the ties in all possible ways, but it is computationally intensive if the number of ties at any failure time is large. If the ties are not too numerous, the average partial likelihood is well approximated by the Breslow likelihood, often used in practice because of its simple form: \[ L(\beta) = \prod_{j=1}^{k} \frac{\exp\{s_{j}(t_{j})^{\top}\beta\}}{ \left(\sum_{l \in R(t_{j})}\exp\{Z_{l}(t_{j})^{\top}\beta\}\right)^{d_{j}}}, \quad \text{where} \quad s_{j}(t_{j}) = \sum_{i=1}^{d_{j}} Z_{j_{i}}(t_{j}) \] is the sum of the covariates of the \(d_{j}\) individuals observed to fail at \(t_{j}\).
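For instance, in group 2 there are \(d_{j} = 4\) failures tied at \(t_{j} = 233\); with the group indicator as the single covariate, \(s_{j}(t_{j}) = 4\) and the Breslow denominator for that time is the risk-set sum \(\sum_{l \in R(233)} \exp\{Z_{l}\beta\}\) raised to the fourth power.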

First, we just write down the (log-)likelihood and optimize it via a standard and efficient optimization routine (BFGS).

## rr_lkl: relative risk model (Breslow) negative log-partial-likelihood
## note: only the uncensored rows enter tZ, so the risk sets below are built
##       from the failure times alone (censored subjects never contribute)
rr_lkl <- function(data, Z, beta) {
    datk <- data %>% filter(status %in% "1") ; datk$id <- seq(nrow(datk))
    tZ <- data.frame(datk$time, Z) ; names(tZ)[1] <- "time"
    times <- sort(unique(tZ$time))
    tZb <- exp(as.matrix(tZ[ , -1]) %*% beta)
    lkl <- numeric(length(times))
    i <- 1
    for (j in times) {
        lkl[i] <-
            colSums(as.matrix(tZ[tZ$time == j, -1])) %*% beta -   ## s_j(t_j)' beta
            nrow(datk %>% filter(time %in% j)) *                  ## d_j
            log(sum(tZb[(datk %>% filter(time >= j))$id]))        ## log risk-set sum
        i <- i + 1
    }
    return(-sum(lkl)) ## returning the negative log-likelihood
}
rr_lkl(data = dat, Z = c(rep(0, 17), rep(1, 19)), beta = 0)
[1] 96.68536
guess <- 0
names(guess) <- parnames(rr_lkl) <- c("beta")
moptim <- mle2(rr_lkl, start = guess,
               data = list(data = dat, Z = c(rep(0, 17), rep(1, 19))))
summary(moptim)
Maximum likelihood estimation

Call:
mle2(minuslogl = rr_lkl, start = guess, data = list(data = dat, 
    Z = c(rep(0, 17), rep(1, 19))))

Coefficients:
     Estimate Std. Error z value   Pr(z)  
beta -0.60553    0.34761  -1.742 0.08152 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

-2 log L: 190.3889 
logLik(moptim)
'log Lik.' -95.19445 (df=1)

It would be nicer to see some likelihood regions/curves, but since we have just one parameter here, we settle for a deviance profile.

plot(profile(moptim))
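As an aside, bbmle can turn this same profile into a likelihood-based confidence interval for \(\beta\) (left commented here, since its output is not shown).

## profile-likelihood confidence interval for beta
## confint(moptim)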

Now we proceed in a more explicit manner, i.e., writing down our own optimization algorithm (here, a Newton-Raphson procedure).

The maximum likelihood estimate (MLE), \(\hat{\beta}\), can be obtained as a solution to the vector equation \[ U(\beta) = \partial \log L/\partial \beta = \sum_{j=1}^{k} [s_{j}(t_{j}) - \varepsilon(\beta, t_{j})] = 0, \] where \[ \varepsilon(\beta, t_{j}) = d_{j} \sum_{l \in R(t_{j})} Z_{l}(t_{j}) \, p_{l}(\beta, t_{j}) \quad \text{and} \quad p_{l}(\beta, t_{j}) = \frac{\exp\{Z_{l}(t_{j})^{\top}\beta\}}{ \sum_{i \in R(t_{j})} \exp\{Z_{i}(t_{j})^{\top}\beta\}}, \quad l \in R(t_{j}). \] The observed information matrix is \[ I(\beta) = - \frac{\partial^{2} \log L}{\partial \beta \partial \beta^{\top}} = \sum_{j=1}^{k} \Upsilon(\beta, t_{j}), \quad \text{where} \quad \Upsilon(\beta, t_{j}) = d_{j} \sum_{l \in R(t_{j})} [Z_{l}(t_{j}) - \bar{Z}(\beta, t_{j})]^{\otimes 2} \, p_{l}(\beta, t_{j}) \] and \(\bar{Z}(\beta, t_{j}) = d_{j}^{-1}\varepsilon(\beta, t_{j})\) is the weighted mean of the covariates over the risk set.

Then, we implement these functions.

gradHess <- function(data, Z, beta) {
    datk <- data %>% filter(status %in% "1") ; datk$id <- seq(nrow(datk))
    tZ <- as.data.frame(cbind(datk$time, Z)) ; names(tZ)[1] <- "time"
    times <- sort(unique(tZ$time))
    tZb <- exp(as.matrix(tZ[ , -1]) %*% beta)
    ## score contributions, one per failure time
    grad <- numeric(length(times)) ; i <- 1
    for (j in times) {
        sj <- colSums(as.matrix(tZ[tZ$time == j, -1]))   ## s_j(t_j)
        d <- nrow(datk %>% filter(time %in% j))           ## d_j
        riskset <- tZb[(datk %>% filter(time >= j))$id]
        denom <- sum(riskset) ; pl <- riskset/denom
        grad[i] <- sj - d * t(as.matrix(tZ[tZ$time >= j, -1])) %*% pl
        i <- i + 1
    }
    ## information contributions, one per failure time
    Hess <- numeric(length(times)) ; i <- 1
    for (j in times) {
        new_tZ <- tZ %>% filter(time >= j)
        d <- nrow(new_tZ %>% filter(time %in% j))
        riskset <- tZb[(datk %>% filter(time >= j))$id]
        risksum <- sum(riskset)
        pl <- riskset/risksum
        estyle <- new_tZ[ , -1] %*% pl   ## weighted covariate mean over the risk set
        loopsize <- nrow(new_tZ) ; inside <- numeric(loopsize)
        for (l in seq(loopsize)) {
            zl <- new_tZ[l, -1]
            core <- zl - estyle
            inside[l] <- t(core) %*% core * pl[l]
        }
        Hess[i] <- d * sum(inside)
        i <- i + 1
    }
    grad <- sum(grad) ; Hess <- -sum(Hess) ; change <- solve(Hess, grad)
    return(list(grad = grad, hess = Hess, change = change))
}
gradHess(data = dat, Z = c(rep(0, 17), rep(1, 19)), beta = 0)
$grad
[1] -4.850569

$hess
[1] -7.553145

$change
[1] 0.6421919
## checking the implementation
## pracma::grad(rr_lkl, x0 = 0, data = dat, Z = c(rep(0, 17), rep(1, 19)))
## pracma::hessian(rr_lkl, x0 = 0, data = dat, Z = c(rep(0, 17), rep(1, 19)))
## numDeriv::grad(rr_lkl, x = 0, data = dat, Z = c(rep(0, 17), rep(1, 19)))
## numDeriv::hessian(rr_lkl, x = 0, data = dat, Z = c(rep(0, 17), rep(1, 19)))

And finally, our optimization algorithm.

newton_raphson <- function(guess, data, Z, max_iter = 10, tol = 1e-5) {
    sol <- matrix(NA, nrow = max_iter, ncol = length(guess))
    sol[1, ] <- guess
    for (i in 2:max_iter) {
        change <- gradHess(data = data, Z = Z, beta = guess)
        sol[i, ] <- guess - change$change
        guess <- sol[i, ]
        tol_iter <- abs(sol[i, ] - sol[i - 1, ])
        if (all(tol_iter < tol)) break
    }
    return(list(value = -rr_lkl(data = data, Z = Z, beta = guess),
                par = guess, std.err = sqrt(1/-change$hess), iter = i - 1))
}
(model <- newton_raphson(guess = 0, data = dat, Z = c(rep(0, 17), rep(1, 19))))
$value
[1] -95.19445

$par
[1] -0.6055475

$std.err
[1] 0.3476125

$iter
[1] 3

And now via the standard R implementation, coxph() from the survival package.

dat2 <- dat
dat2$status <- ifelse(dat2$status == 1, 2, 1)
(msurv <- coxph(Surv(time, status) ~ group, data = dat2, method = "breslow"))
Call:
coxph(formula = Surv(time, status) ~ group, data = dat2, method = "breslow")

          coef exp(coef) se(coef)     z      p
group2 -0.5959    0.5511   0.3484 -1.71 0.0872

Likelihood ratio test=2.88  on 1 df, p=0.08977
n= 40, number of events= 36 
logLik(msurv)
'log Lik.' -100.7191 (df=1)

Oops, it is different. =/
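A hedged guess about the source of the discrepancy: my risk sets above contain only the uncensored observations, while coxph() keeps censored subjects at risk until their censoring times. Below is a quick sketch of a check along those lines (rr_lkl_full is a name I am introducing here; the optim() call is left commented, as with the other checks).

## Breslow negative log-partial-likelihood again, but with censored subjects
## kept in the risk sets until their censoring times
rr_lkl_full <- function(data, Z, beta) {
    d <- data.frame(time = data$time, event = data$status == "1", Z = Z)
    loglik <- 0
    for (j in sort(unique(d$time[d$event]))) {
        fails  <- d[d$time == j & d$event, "Z", drop = FALSE] ## failures at t_j
        atrisk <- d[d$time >= j, "Z", drop = FALSE]           ## censored included
        loglik <- loglik + sum(as.matrix(fails) %*% beta) -
            nrow(fails) * log(sum(exp(as.matrix(atrisk) %*% beta)))
    }
    return(-loglik)
}
## if the risk sets are indeed the issue, this should land near coxph's estimate:
## optim(0, rr_lkl_full, data = dat, Z = as.numeric(dat$group == "2"),
##       method = "BFGS", hessian = TRUE)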

Estimation of the Baseline Hazard or Survivor Function

The likelihood function is \[ \prod_{i=1}^{k} \left[\prod_{j \in D_{i}} (1 - \alpha_{i}^{\exp\{Z_{j}(t_{i})^{\top}\beta\}}) \prod_{l \in R(t_{i})-D_{i}} \alpha_{i}^{\exp\{Z_{l}(t_{i})^{\top}\beta\}} \right], \] which is to be maximized in \(\alpha_{1}, \dots, \alpha_{k}\).

The estimation of the survivor function can be carried out by the joint estimation of the \(\alpha\)’s and \(\beta\). More simply, however, we can take \(\beta = \hat{\beta}\) as estimated above from the partial likelihood and then maximize the likelihood with respect to \(\alpha_{1}, \dots, \alpha_{k}\). Differentiating the logarithm of that likelihood with respect to \(\alpha_{i}\) gives the maximum likelihood estimate of \(\alpha_{i}\) as a solution to \[ \sum_{j \in D_{i}} \exp\{Z_{j}(t_{i})^{\top}\hat{\beta}\} [1 - \alpha_{i}^{\exp\{Z_{j}(t_{i})^{\top}\hat{\beta}\}}]^{-1} = \sum_{l \in R(t_{i})} \exp\{Z_{l}(t_{i})^{\top}\hat{\beta}\}. \] If only a single failure occurs at \(t_{i}\), this equation can be solved directly for \(\hat{\alpha}_{i}\), giving \[ \hat{\alpha}_{i} = \left[ 1 - \frac{\exp\{Z_{i}(t_{i})^{\top}\hat{\beta}\}}{ \sum_{l \in R(t_{i})} \exp\{Z_{l}(t_{i})^{\top}\hat{\beta}\}} \right]^{\exp\{-Z_{i}(t_{i})^{\top}\hat{\beta}\}}. \] Otherwise, an iterative solution is required. A suitable initial value for the iteration is \[ \alpha_{i_{0}} = 1 - d_{i} \left[ \sum_{l \in R(t_{i})} \exp\{Z_{l}(t_{i})^{\top}\hat{\beta}\} \right]^{-1}. \] Note that the \(\hat{\alpha}_{i}\)’s can be calculated separately.
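For the tied times, the score equation above could also be solved directly with a root finder; here is a minimal sketch of that idea (solve_alpha, eZb_fail and eZb_risk are names I am introducing; the code below instead maximizes the \(\alpha\)-likelihood with mle2).

## sketch: solve the alpha_i score equation at a tied failure time
## eZb_fail: exp(Z_j(t_i)' betahat) for the d_i failures at t_i
## eZb_risk: exp(Z_l(t_i)' betahat) for everyone in R(t_i)
solve_alpha <- function(eZb_fail, eZb_risk) {
    score <- function(a) sum(eZb_fail/(1 - a^eZb_fail)) - sum(eZb_risk)
    uniroot(score, interval = c(1e-8, 1 - 1e-8))$root
}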

lkl <- function(alpha, tZ, ti, betahat) {
    ## subjects failing at t_i, and (uncensored) subjects with later failure times
    failured <- as.matrix( (tZ %>% filter(time %in% ti))[ , -1] )
    risk_future <- tZ %>% filter(time > ti)
    core_fail <- exp(failured %*% betahat)
    core_risk <- if (nrow(risk_future) != 0) {
        exp(as.matrix(risk_future[ , -1]) %*% betahat)
    } else 1
    fail_contr <- sum(log(1 - alpha^core_fail)) ## product over D_i
    risk_contr <- sum(log(alpha^core_risk))     ## product over R(t_i) - D_i
    out <- fail_contr + risk_contr
    return(-out)
}
getalphas <- function(data, Z, betahat) {
    datk <- data %>% filter(status %in% "1") ; datk$id <- seq(nrow(datk))
    tZ <- data.frame(datk$time, Z) ; names(tZ)[1] <- "time"
    ti <- sort(unique(datk$time))
    alphas <- numeric(length(ti))
    j <- 1
    for (i in ti) {
        risk_ti <- as.matrix( (tZ %>% filter(time %in% i))[ , -1] )
        if (nrow(risk_ti) == 1) {
            ## single failure at t_i: closed-form solution
            fail <- exp(risk_ti %*% betahat)
            fail2 <- exp(-risk_ti %*% betahat)
            ## note: the denominator below uses strictly later failure times only,
            ## while R(t_i) in the formula also contains the subject failing at t_i
            ## and any censored subjects still at risk
            underiskZ <- as.matrix( (tZ %>% filter(time > i))[ , -1] )
            underisk <- ifelse(nrow(underiskZ) != 0,
                               sum(exp(underiskZ %*% betahat)), 1)
            alphas[j] <- (1 - fail/underisk)^fail2
        }
        else {
            ## tied failures at t_i: maximize the alpha-likelihood numerically
            ## proposal <- 1-nrow(risk_ti)/sum(exp(risk_ti %*% betahat))
            proposal <- .5
            names(proposal) <- parnames(lkl) <- "alpha"
            alphas[j] <- mle2(lkl, start = proposal,
                              data = list(tZ = tZ, ti = i, betahat = betahat)
                              )@coef
        }
        j <- j + 1
    }
    out <- as.data.frame(cbind(ti, alphas)) ; names(out) <- c("time", "alpha")
    return(out)
}
(alphas <- getalphas(data = dat, Z = c(rep(0, 17), rep(1, 19)),
                     betahat = model$par))
   time      alpha
1   142  0.9630359
2   143  0.9612763
3   156  0.9607961
4   163  0.9599390
5   164  0.9578635
6   188  0.3333337
7   190  0.9517663
8   192  0.9493220
9   198  0.9484983
10  205  0.9470091
11  206  0.9433134
12  209  0.9399069
13  213  0.9360649
14  216  0.9316980
15  220  0.9266908
16  227  0.9208914
17  230  0.9140957
18  232  0.2587153
19  233  0.1198557
20  234  0.8642449
21  239  0.8582862
22  240  0.8464259
23  246  0.8104141
24  261  0.7987572
25  265  0.7318228
26  280  0.2587153
27  296  0.2587153
28  304 -0.8322550
29  323  0.2355224

The maximum likelihood estimate of the baseline survivor function is \[ \hat{F}_{0}(t) = \prod_{i | t_{i} \leq t} \hat{\alpha}_{i}, \] a step function with discontinuities at each observed failure time \(t_{i}\).

baseF <- function(alphas, data) {
    time <- sort(unique(data$time))
    baseline <- numeric(length(time))
    id <- 1
    for (i in time) {
        baseline[id] <- prod( (alphas %>% filter(time <= i))$alpha )
        id <- id + 1
    }
    out <- data.frame(time, baseline)
    return(out)
}
baseline <- baseF(alphas = alphas, data = dat)

base_surv <- ggplot(baseline, aes(x = time, y = baseline)) +
    geom_step() +
    labs(x = "Failure times", y = NULL,
         title = "Baseline survivor function") +
    theme_minimal()

The corresponding estimate of the cumulative hazard function is \[ \hat{\Lambda}_{0}(t) = \sum_{i} (1 - \hat{\alpha}_{i}) \, I(t_{i} \leq t). \]

cum_hazard <- function(alphas, data) {
    time <- sort(unique(data$time))
    cumh <- numeric(length(time))
    id <- 1
    for (i in time) {
        cumh[id] <- sum(1 - (alphas %>% filter(time <= i))$alpha)
        id <- id + 1
    }
    out <- data.frame(time, cumh)
    return(out)
}
cumhazard <- cum_hazard(alphas = alphas, data = dat)

cum_haz <- ggplot(cumhazard, aes(x = time, y = cumh)) +
    geom_step() +
    labs(x = "Failure times", y = NULL,
         title = "Cumulative hazard function") +
    theme_minimal()

base_surv | cum_haz

The estimated survivor function for a constant covariate vector \(Z\) is \[ \hat{F}(t \mid Z) = [\hat{F}_{0}(t)]^{\exp\{Z^{\top}\hat{\beta}\}}. \]

surv <- function(data, g, Zg, betahat, base) {
    datk <- data %>% filter(status %in% "1" & group %in% g)
    timeg <- unique(datk$time)
    tZ <- data.frame(timeg, Zg)
    covs <- exp(as.matrix(tZ[ , -1]) %*% betahat) ## relative risk exp(Z' betahat)
    Fhat <- c(1, (base %>% filter(time %in% timeg))$baseline^covs)
    out <- data.frame(c(0, timeg), Fhat) ; names(out) <- c("time", "Fhat")
    return(out)
}
(survg1 <- surv(data = dat, g = 1, Zg = rep(0, 16),
                betahat = model$par, base = baseline))
   time          Fhat
1     0  1.000000e+00
2   143  9.257436e-01
3   164  8.178415e-01
4   188  2.726141e-01
5   190  2.594650e-01
6   192  2.463158e-01
7   206  2.087079e-01
8   209  1.961660e-01
9   213  1.836241e-01
10  216  1.710822e-01
11  220  1.585403e-01
12  227  1.459984e-01
13  230  1.334565e-01
14  234  3.576493e-03
15  246  2.105647e-03
16  265  1.230853e-03
17  304 -6.856571e-05
(survg2 <- surv(data = dat, g = 2, Zg = rep(1, 13),
                betahat = model$par, base = baseline))
   time         Fhat
1     0 1.0000000000
2   142 0.9699634268
3   156 0.9095030346
4   163 0.8798869219
5   198 0.3081055452
6   205 0.2948176826
7   232 0.0655169819
8   233 0.0117583463
9   239 0.0092321614
10  240 0.0080662628
11  261 0.0056720259
12  280 0.0014740487
13  296 0.0004932596
14  323          NaN
nr_surv <- ggplot(survg1, aes(x = time, y = Fhat)) +
    geom_step(aes(color = "1"), size = .85) +
    geom_step(data = survg2, aes(color = "2"), size = .85) +
    labs(color = "Group", x = "Failure times", y = NULL,
         title = "Survivor function estimates", subtitle = "Mine") +
    scale_color_manual(values = c(1, 2)) +
    theme_minimal()

coxph_surv <- ggadjustedcurves(msurv, data = dat2,
                               method = "marginal", variable = "group",
                               palette = c(1, 2), ggtheme = theme_minimal(),
                               xlab = "Failure times", ylab = NULL,
                               main = "Survivor function estimates",
                               submain = "Not mine", legend.title = "Group")
nr_surv | coxph_surv

Something is wrong in my code, but I have not pinned it down. As noted above, my prime suspect is the construction of the risk sets: they are built only from the uncensored failure times (and, in getalphas(), only from strictly later ones), whereas the formulas, and coxph(), keep every subject at risk up to its observed time, censored or not.
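A further cross-check I could run (left commented, since its output is not shown here): compare my cumhazard object with the Breslow baseline cumulative hazard that the survival package computes from the coxph fit.

## Breslow baseline cumulative hazard from the coxph fit, for comparison
## with the cumhazard object above
## basehaz(msurv, centered = FALSE)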

Reference:

Kalbfleisch, J. D. and Prentice, R. L. (2002). The Statistical Analysis of Failure Time Data, 2nd edition. John Wiley & Sons.