## packages assumed to be loaded for this section:
library(DT) ; library(dplyr) ; library(ggplot2) ; library(patchwork)
library(bbmle) ; library(survival) ; library(survminer)

dat <- data.frame(id = factor(1:40),
                  time = c(143, 164, 188, 188, 190, 192, 206, 209, 213,
                           216, 220, 227, 230, 234, 246, 265, 304, 216,
                           244, 142, 156, 163, 198, 205, 232, 232, 233,
                           233, 233, 233, 239, 240, 261, 280, 280, 296,
                           296, 323, 204, 344),
                  status = c(rep("1", 17), rep("0", 2),
                             rep("1", 19), rep("0", 2)),
                  group = c(rep("1", 19), rep("2", 21)))
datatable(dat, options = list(pageLength = 5),
          class = 'cell-border stripe', rownames = FALSE)
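A quick cross-tabulation (my addition) confirms the design: 17 failures plus 2 censored times in group 1, and 19 failures plus 2 censored times in group 2.
## censoring pattern by group: rows = group, columns = status (0 = censored)
table(dat$group, dat$status)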
ggplot(dat, aes(x = time, y = id, label = time)) +
  geom_segment(aes(x = 0, xend = time, y = id, yend = id), col = "gray") +
  geom_point(aes(col = status), size = 7) +
  geom_text(color = "white", size = 3) +
  scale_color_manual(labels = c("censored", "failed"),
                     values = c("1" = 1, "0" = 2)) +
  facet_wrap(~ group, scales = "free") +
  labs(title = "survival times by treatment group",
       subtitle = "note that we have ties in both groups!") +
  theme_classic(base_size = 14)
tapply(dat$time, dat$group, summary)
$`1`
Min. 1st Qu. Median Mean 3rd Qu. Max.
143.0 191.0 216.0 215.5 232.0 304.0
$`2`
Min. 1st Qu. Median Mean 3rd Qu. Max.
142.0 205.0 233.0 239.2 280.0 344.0
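Before fitting anything semiparametric, Kaplan-Meier curves give a quick nonparametric benchmark (my addition, via survival::survfit; Surv() accepts a logical event indicator), useful for the comparisons at the end.
## Kaplan-Meier survival curves by group
km <- survfit(Surv(time, status == "1") ~ group, data = dat)
plot(km, col = 1:2, lty = 1, xlab = "time", ylab = "survival probability")
legend("bottomleft", legend = c("group 1", "group 2"), col = 1:2, lty = 1)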
Consider the relative risk model \[ \lambda(t \mid Z) = \lambda_{0}(t) \exp\{Z(t)^{\top}\beta\}, \] under arbitrary independent right censoring and, for the moment, without ties.
The partial likelihood for \(\beta\), under independent censoring, is \[ L(\beta) = \prod_{j=1}^{k} \frac{\exp\{Z_{j}(t_{j})^{\top}\beta\}}{\sum_{l \in R(t_{j})} \exp\{Z_{l}(t_{j})^{\top}\beta\}}. \] In the carcinogenesis dataset there are ties among the uncensored failure times, so the partial likelihood needs to be adjusted. The average partial likelihood arises from breaking the ties in all possible ways, but it becomes computationally expensive when the number of ties at any failure time is large. If the ties are not too numerous, the average partial likelihood is well approximated by the Breslow likelihood, often used in practice because of its simple form: \[ L(\beta) = \prod_{j=1}^{k} \frac{\exp\{s_{j}(t_{j})^{\top}\beta\}}{ \left(\sum_{l \in R(t_{j})} \exp\{Z_{l}(t_{j})^{\top}\beta\}\right)^{d_{j}}}, \quad \text{where} \quad s_{j}(t_{j}) = \sum_{i=1}^{d_{j}} Z_{j_{i}}(t_{j}) \] is the sum of the covariate vectors of the \(d_{j}\) individuals observed to fail at \(t_{j}\).
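To see exactly where the ties sit (a quick check, my addition): \(d_{j} = 2\) at times 188, 232, 280, and 296, and \(d_{j} = 4\) at 233.
## tied uncensored failure times (the d_j > 1 cases)
evt <- dat$time[dat$status == "1"]
table(evt)[table(evt) > 1]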
As a first pass, we simply write down the (negative log-)likelihood and hand it to a standard, efficient optimization routine (BFGS).
## rr_lkl: negative log-likelihood of the relative risk model
##         (Breslow adjustment for ties)
rr_lkl <- function(data, Z, beta) {
  ## keep only the uncensored subjects
  ## (note: the risk sets below are therefore built from failures only)
  datk <- data %>% filter(status %in% "1") ; datk$id <- seq(nrow(datk))
  tZ <- data.frame(datk$time, Z) ; names(tZ)[1] <- "time"
  times <- sort(unique(tZ$time))            ## distinct failure times t_j
  tZb <- exp(as.matrix(tZ[ , -1]) %*% beta)
  lkl <- numeric(length(times))
  i <- 1
  for (j in times) {
    ## s_j' beta  -  d_j * log( sum of exp(Z' beta) over the risk set )
    lkl[i] <-
      colSums(as.matrix(tZ[tZ$time == j, -1])) %*% beta -
      nrow(datk %>% filter(time %in% j)) *
      log(sum(tZb[(datk %>% filter(time >= j))$id]))
    i <- i + 1
  }
  return(-sum(lkl)) ## negative log-likelihood, as required by mle2()
}
rr_lkl(data = dat, Z = c(rep(0, 17), rep(1, 19)), beta = 0)
[1] 96.68536
guess <- 0
names(guess) <- parnames(rr_lkl) <- c("beta")
moptim <- mle2(rr_lkl, start = guess,
               data = list(data = dat, Z = c(rep(0, 17), rep(1, 19))))
summary(moptim)
Maximum likelihood estimation
Call:
mle2(minuslogl = rr_lkl, start = guess, data = list(data = dat,
Z = c(rep(0, 17), rep(1, 19))))
Coefficients:
Estimate Std. Error z value Pr(z)
beta -0.60553 0.34761 -1.742 0.08152 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-2 log L: 190.3889
logLik(moptim)
'log Lik.' -95.19445 (df=1)
It would be nicer to look at likelihood regions or curves, but since we have just one parameter here, we settle for the deviance profile.
plot(profile(moptim))
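The profile also gives us a likelihood-based confidence interval almost for free (a sketch, my addition; bbmle provides a confint() method for mle2 fits):
## profile-likelihood 95% interval for beta
confint(moptim, level = 0.95)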
Now we proceed in a more explicit manner, writing the optimization algorithm ourselves (a Newton-Raphson procedure).
The maximum likelihood estimate (MLE), \(\hat{\beta}\), can be obtained as a solution to the vector equation \[ U(\beta) = \frac{\partial \log L}{\partial \beta} = \sum_{j=1}^{k} [s_{j}(t_{j}) - d_{j}\, \varepsilon(\beta, t_{j})] = 0, \] where \[ \varepsilon(\beta, t_{j}) = \sum_{l \in R(t_{j})} Z_{l}(t_{j})\, p_{l}(\beta, t_{j}) \quad \text{and} \quad p_{l}(\beta, t_{j}) = \frac{\exp\{Z_{l}(t_{j})^{\top}\beta\}}{ \sum_{i \in R(t_{j})} \exp\{Z_{i}(t_{j})^{\top}\beta\}}, \quad l \in R(t_{j}). \] The observed information matrix is \[ I(\beta) = - \frac{\partial^{2} \log L}{\partial \beta\, \partial \beta^{\top}} = \sum_{j=1}^{k} \Upsilon(\beta, t_{j}), \quad \text{where} \quad \Upsilon(\beta, t_{j}) = d_{j} \sum_{l \in R(t_{j})} [Z_{l}(t_{j}) - \varepsilon(\beta, t_{j})]^{\otimes 2}\, p_{l}(\beta, t_{j}). \]
Then, we implement these functions.
gradHess <- function(data, Z, beta) {
  datk <- data %>% filter(status %in% "1") ; datk$id <- seq(nrow(datk))
  tZ <- as.data.frame(cbind(datk$time, Z)) ; names(tZ)[1] <- "time"
  times <- sort(unique(tZ$time))
  tZb <- exp(as.matrix(tZ[ , -1]) %*% beta)
  ## score contributions, one per distinct failure time
  grad <- numeric(length(times)) ; i <- 1
  for (j in times) {
    sj <- colSums(as.matrix(tZ[tZ$time == j, -1]))  ## s_j(t_j)
    d <- nrow(datk %>% filter(time %in% j))         ## d_j
    riskset <- tZb[(datk %>% filter(time >= j))$id] ## failures still at risk
    denom <- sum(riskset) ; pl <- riskset/denom     ## p_l(beta, t_j)
    grad[i] <- sj - d * t(as.matrix(tZ[tZ$time >= j, -1])) %*% pl
    i <- i + 1
  }
  ## information contributions, one per failure time (scalar beta here)
  Hess <- numeric(length(times))
  i <- 1
  for (j in times) {
    new_tZ <- tZ %>% filter(time >= j)
    d <- nrow(new_tZ %>% filter(time %in% j))
    riskset <- tZb[(datk %>% filter(time >= j))$id]
    risksum <- sum(riskset)
    pl <- riskset/risksum
    estyle <- new_tZ[ , -1] %*% pl                  ## epsilon(beta, t_j)
    loopsize <- nrow(new_tZ) ; inside <- numeric(loopsize)
    for (l in seq(loopsize)) {
      zl <- new_tZ[l, -1]
      core <- zl - estyle
      inside[l] <- t(core) %*% core * pl[l]
    }
    Hess[i] <- d * sum(inside)
    i <- i + 1
  }
  grad <- sum(grad) ; Hess <- -sum(Hess) ; change <- solve(Hess, grad)
  return(list(grad = grad, hess = Hess, change = change))
}
gradHess(data = dat, Z = c(rep(0, 17), rep(1, 19)), beta = 0)
$grad
[1] -4.850569
$hess
[1] -7.553145
$change
[1] 0.6421919
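To make the update rule concrete (my addition): a single Newton step from \(\beta = 0\) is \(0 - (-4.8506)/(-7.5531) = -0.6422\), already close to the MLE we find below.
## one Newton step from beta = 0, reusing the quantities above
gh <- gradHess(data = dat, Z = c(rep(0, 17), rep(1, 19)), beta = 0)
0 - gh$change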
## checking the implementation
## pracma::grad(rr_lkl, x0 = 0, data = dat, Z = c(rep(0, 17), rep(1, 19)))
## pracma::hessian(rr_lkl, x0 = 0, data = dat, Z = c(rep(0, 17), rep(1, 19)))
## numDeriv::grad(rr_lkl, x = 0, data = dat, Z = c(rep(0, 17), rep(1, 19)))
## numDeriv::hessian(rr_lkl, x = 0, data = dat, Z = c(rep(0, 17), rep(1, 19)))
And finally, our optimization algorithm.
newton_raphson <- function(guess, data, Z, max_iter = 10, tol = 1e-5) {
  sol <- matrix(NA, nrow = max_iter, ncol = length(guess))
  sol[1, ] <- guess
  for (i in 2:max_iter) {
    change <- gradHess(data = data, Z = Z, beta = guess)
    sol[i, ] <- guess - change$change  ## Newton update
    guess <- sol[i, ]
    tol_iter <- abs(sol[i, ] - sol[i - 1, ])
    if (all(tol_iter < tol)) break     ## stop when the step is small enough
  }
  return(list(value = -rr_lkl(data = data, Z = Z, beta = guess),
              par = guess, std.err = sqrt(1/-change$hess), iter = i - 1))
}
(model <- newton_raphson(guess = 0, data = dat, Z = c(rep(0, 17), rep(1, 19))))
$value
[1] -95.19445
$par
[1] -0.6055475
$std.err
[1] 0.3476125
$iter
[1] 3
These results agree with the mle2() fit above. And now via the standard R implementation, survival::coxph().
dat2 <- dat
## recode for Surv(): 1 = censored, 2 = event
dat2$status <- ifelse(dat2$status == 1, 2, 1)
(msurv <- coxph(Surv(time, status) ~ group, data = dat2, method = "breslow"))
Call:
coxph(formula = Surv(time, status) ~ group, data = dat2, method = "breslow")
coef exp(coef) se(coef) z p
group2 -0.5959 0.5511 0.3484 -1.71 0.0872
Likelihood ratio test=2.88 on 1 df, p=0.08977
n= 40, number of events= 36
logLik(msurv)
'log Lik.' -100.7191 (df=1)
Oops, they differ: \(-95.19\) versus \(-100.72\) :-/ A plausible reason: rr_lkl() builds every risk set from the uncensored subjects only (datk keeps status == "1"), while coxph() keeps censored subjects at risk until their censoring times, so its risk-set sums are larger.
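One way to test this hunch (my addition, not part of the original analysis): refit coxph() on the events-only data. Its risk sets then coincide with the ones rr_lkl() builds, so the two fits should agree.
## with the censored rows dropped, coxph() should reproduce our
## estimate (-0.6055) and log-likelihood (-95.19)
coxph(Surv(time, status) ~ group, data = subset(dat2, status == 2),
      method = "breslow")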
We now turn to the survivor function. Following Kalbfleisch and Prentice (2002), consider the likelihood function \[ \prod_{i=1}^{k} \left[\prod_{j \in D_{i}} \left(1 - \alpha_{i}^{\exp\{Z_{j}(t_{i})^{\top}\beta\}}\right) \prod_{l \in R(t_{i})-D_{i}} \alpha_{i}^{\exp\{Z_{l}(t_{i})^{\top}\beta\}} \right], \] which is to be maximized in \(\alpha_{1}, \dots, \alpha_{k}\).
The survivor function can be estimated by jointly estimating the \(\alpha\)'s and \(\beta\). More simply, however, we can fix \(\beta = \hat{\beta}\) as estimated above from the partial likelihood and maximize with respect to \(\alpha_{1}, \dots, \alpha_{k}\). Differentiating the logarithm of that likelihood with respect to \(\alpha_{i}\) shows that the maximum likelihood estimate of \(\alpha_{i}\) solves \[ \sum_{j \in D_{i}} \exp\{Z_{j}(t_{i})^{\top}\hat{\beta}\} \left[1 - \alpha_{i}^{\exp\{Z_{j}(t_{i})^{\top}\hat{\beta}\}}\right]^{-1} = \sum_{l \in R(t_{i})} \exp\{Z_{l}(t_{i})^{\top}\hat{\beta}\}. \] If only a single failure occurs at \(t_{i}\), this equation can be solved directly for \(\hat{\alpha}_{i}\), giving \[ \hat{\alpha}_{i} = \left[ 1 - \frac{\exp\{Z_{i}(t_{i})^{\top}\hat{\beta}\}}{ \sum_{l \in R(t_{i})} \exp\{Z_{l}(t_{i})^{\top}\hat{\beta}\}} \right]^{\exp\{-Z_{i}(t_{i})^{\top}\hat{\beta}\}}. \] Otherwise, an iterative solution is required. A suitable initial value for the iteration is \[ \alpha_{i_{0}} = 1 - d_{i} \left[ \sum_{l \in R(t_{i})} \exp\{Z_{l}(t_{i})^{\top}\hat{\beta}\} \right]^{-1}. \] Note that each \(\hat{\alpha}_{i}\) can be computed separately.
## lkl: negative log-likelihood in alpha_i at a tied failure time ti
lkl <- function(alpha, tZ, ti, betahat) {
  failured <- as.matrix( (tZ %>% filter(time %in% ti))[ , -1] ) ## D_i
  risk_future <- tZ %>% filter(time > ti)                       ## R(t_i) - D_i
  core_fail <- exp(failured %*% betahat)
  ## caution: ifelse() is vectorized on its first argument and so
  ## returns a length-one result here, truncating the risk-set term
  core_risk <- ifelse(nrow(risk_future) != 0,
                      exp(as.matrix(risk_future[ , -1]) %*% betahat), 1)
  fail_contr <- sum(log(1 - alpha^core_fail))
  risk_contr <- sum(log(alpha^core_risk))
  out <- fail_contr + risk_contr
  return(-out)
}
getalphas <- function(data, Z, betahat) {
  datk <- data %>% filter(status %in% "1") ; datk$id <- seq(nrow(datk))
  tZ <- data.frame(datk$time, Z) ; names(tZ)[1] <- "time"
  ti <- sort(unique(datk$time))
  alphas <- numeric(length(ti))
  j <- 1
  for (i in ti) {
    risk_ti <- as.matrix( (tZ %>% filter(time %in% i))[ , -1] )
    if (nrow(risk_ti) == 1) {
      ## single failure at ti: closed-form alpha_i
      fail <- exp(risk_ti %*% betahat)
      fail2 <- exp(-risk_ti %*% betahat)
      ## note: time > i leaves the subject failing at ti out of the sum
      underiskZ <- as.matrix( (tZ %>% filter(time > i))[ , -1] )
      underisk <- ifelse(nrow(underiskZ) != 0,
                         sum(exp(underiskZ %*% betahat)), 1)
      alphas[j] <- (1 - fail/underisk)^fail2
    }
    else {
      ## ties at ti: numeric maximization, started at 0.5
      ## (the suggested starting value alpha_{i0} is kept commented out)
      ## proposal <- 1 - nrow(risk_ti)/sum(exp(risk_ti %*% betahat))
      proposal <- .5
      names(proposal) <- parnames(lkl) <- "alpha"
      alphas[j] <- mle2(lkl, start = proposal,
                        data = list(tZ = tZ, ti = i, betahat = betahat)
                        )@coef
    }
    j <- j + 1
  }
  out <- as.data.frame(cbind(ti, alphas)) ; names(out) <- c("time", "alpha")
  return(out)
}
(alphas <- getalphas(data = dat, Z = c(rep(0, 17), rep(1, 19)),
                     betahat = model$par))
time alpha
1 142 0.9630359
2 143 0.9612763
3 156 0.9607961
4 163 0.9599390
5 164 0.9578635
6 188 0.3333337
7 190 0.9517663
8 192 0.9493220
9 198 0.9484983
10 205 0.9470091
11 206 0.9433134
12 209 0.9399069
13 213 0.9360649
14 216 0.9316980
15 220 0.9266908
16 227 0.9208914
17 230 0.9140957
18 232 0.2587153
19 233 0.1198557
20 234 0.8642449
21 239 0.8582862
22 240 0.8464259
23 246 0.8104141
24 261 0.7987572
25 265 0.7318228
26 280 0.2587153
27 296 0.2587153
28 304 -0.8322550
29 323 0.2355224
Before moving on, note that \(\hat{\alpha}\) at \(t = 304\) is negative, which is impossible for a valid survivor factor; we come back to this at the end. The maximum likelihood estimate of the baseline survivor function is \[ \hat{F}_{0}(t) = \prod_{i \mid t_{i} \leq t} \hat{\alpha}_{i}, \] a step function with discontinuities at each observed failure time \(t_{i}\).
baseF <- function(alphas, data) {
  time <- sort(unique(data$time))
  baseline <- numeric(length(time))
  id <- 1
  for (i in time) {
    ## product of the alpha_i over failure times up to t
    baseline[id] <- prod( (alphas %>% filter(time <= i))$alpha )
    id <- id + 1
  }
  out <- data.frame(time, baseline)
  return(out)
}
baseline <- baseF(alphas = alphas, data = dat)
base_surv <- ggplot(baseline, aes(x = time, y = baseline)) +
  geom_step() +
  labs(x = "Failure times", y = NULL,
       title = "Baseline survivor function") +
  theme_minimal()
The corresponding estimate of the cumulative hazard function is \[ \hat{\Lambda}_{0}(t) = \sum_{i} (1 - \hat{\alpha}_{i})\, I(t_{i} \leq t). \]
cum_hazard <- function(alphas, data) {
  time <- sort(unique(data$time))
  cumh <- numeric(length(time))
  id <- 1
  for (i in time) {
    cumh[id] <- sum(1 - (alphas %>% filter(time <= i))$alpha)
    id <- id + 1
  }
  out <- data.frame(time, cumh)
  return(out)
}
cumhazard <- cum_hazard(alphas = alphas, data = dat)
cum_haz <- ggplot(cumhazard, aes(x = time, y = cumh)) +
  geom_step() +
  labs(x = "Failure times", y = NULL,
       title = "Cumulative hazard function") +
  theme_minimal()
base_surv | cum_haz
The estimated survivor function for a fixed covariate vector \(Z\) is \[ \hat{F}(t \mid Z) = [\hat{F}_{0}(t)]^{\exp\{Z^{\top}\hat{\beta}\}}. \]
surv <- function(data, g, Zg, betahat, base) {
  datk <- data %>% filter(status %in% "1" & group %in% g)
  timeg <- unique(datk$time)
  tZ <- data.frame(timeg, Zg)
  covs <- exp(tZ[ , -1] %% betahat) ## caution: %% is modulo, not %*%
  Fhat <- c(1, (base %>% filter(time %in% timeg))$baseline^covs)
  out <- data.frame(c(0, timeg), Fhat) ; names(out) <- c("time", "Fhat")
  return(out)
}
(survg1 <- surv(data = dat, g = 1, Zg = rep(0, 16),
                betahat = model$par, base = baseline))
time Fhat
1 0 1.000000e+00
2 143 9.257436e-01
3 164 8.178415e-01
4 188 2.726141e-01
5 190 2.594650e-01
6 192 2.463158e-01
7 206 2.087079e-01
8 209 1.961660e-01
9 213 1.836241e-01
10 216 1.710822e-01
11 220 1.585403e-01
12 227 1.459984e-01
13 230 1.334565e-01
14 234 3.576493e-03
15 246 2.105647e-03
16 265 1.230853e-03
17 304 -6.856571e-05
(survg2 <- surv(data = dat, g = 2, Zg = rep(1, 13),
                betahat = model$par, base = baseline))
time Fhat
1 0 1.0000000000
2 142 0.9699634268
3 156 0.9095030346
4 163 0.8798869219
5 198 0.3081055452
6 205 0.2948176826
7 232 0.0655169819
8 233 0.0117583463
9 239 0.0092321614
10 240 0.0080662628
11 261 0.0056720259
12 280 0.0014740487
13 296 0.0004932596
14 323 NaN
nr_surv <- ggplot(survg1, aes(x = time, y = Fhat)) +
  geom_step(aes(color = "1"), size = .85) +
  geom_step(data = survg2, aes(color = "2"), size = .85) +
  labs(color = "Group", x = "Failure times", y = NULL,
       title = "Survivor function estimates", subtitle = "Mine") +
  scale_color_manual(values = c(1, 2)) +
  theme_minimal()
coxph_surv <- ggadjustedcurves(msurv, data = dat2,
                               method = "marginal", variable = "group",
                               palette = c(1, 2), ggtheme = theme_minimal(),
                               xlab = "Failure times", ylab = NULL,
                               main = "Survivor function estimates",
                               submain = "Not mine", legend.title = "Group")
nr_surv | coxph_surv
Something is wrong in my code. Re-reading it with the comparison above in hand, a few suspects stand out: (i) as already noted, censored subjects never enter any risk set, since every function starts by keeping only status == "1"; (ii) in getalphas(), filter(time > i) leaves the subjects failing at \(t_{i}\) out of the denominator, which should run over the full risk set \(R(t_{i})\), and this is how \(\hat{\alpha}\) at \(t = 304\) goes negative and later propagates the NaN into survg2; (iii) in lkl(), ifelse() returns a length-one value, so only one term of the risk-set product survives at tied failure times; and (iv) in surv(), %% is the modulo operator, not the matrix product %*%.
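As a minimal sketch of fix (iv), keeping everything upstream as-is (so the risk-set issues in rr_lkl() and getalphas() remain):
## surv() with the exponent computed via %*% instead of the modulo %%
surv_fixed <- function(data, g, Zg, betahat, base) {
  datk <- data %>% filter(status %in% "1" & group %in% g)
  timeg <- unique(datk$time)
  tZ <- data.frame(timeg, Zg)
  covs <- exp(as.matrix(tZ[ , -1]) %*% betahat) ## was: tZ[ , -1] %% betahat
  Fhat <- c(1, (base %>% filter(time %in% timeg))$baseline^covs)
  out <- data.frame(c(0, timeg), Fhat) ; names(out) <- c("time", "Fhat")
  return(out)
}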
Reference:
@book{kalbfleisch2002,
  title     = {The {S}tatistical {A}nalysis of {F}ailure {T}ime {D}ata},
  author    = {Kalbfleisch, J. D. and Prentice, R. L.},
  edition   = {2nd},
  publisher = {John Wiley \& Sons},
  year      = {2002}
}