Exercises - Chapter 8: Permutation Tests
Students:
- Henrique Aparecido Laureano
- Elainy Marciano Batista
- Heidi Mara do Rosário Souza
- Joubert Miranda Guedes
- Nicolas Romano
- Pedro Luciano de Oliveira Gomes
- Ricardo de Faria Souza
- Telma Tompson
- Thais Castelo Branco Monho
Exercises from chapter 8 of the book:
Maria L. Rizzo, 2008. Statistical Computing with R, Chapman & Hall/CRC, ISBN 1-58488-545-9, 393 pages.
Exercise 8.1
Implement the two-sample Cramér-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2.
Solution:
data("chickwts")
## Packages
library(latticeExtra)
## function: two-sample Cramér-von Mises test for equal distributions
cvm <- function(x, y, data){
r <- 10000 # Permutation samples
reps <- vector("numeric", r)
n <- length(x)
m <- length(y)
v.n <- vector("numeric", n) # Replication vectors
v1.n <- vector("numeric", n)
v.m <- vector("numeric", m)
v1.m <- vector("numeric", m)
z <- c(x, y)
N <- length(z)
for (i in 1:n) v.n[i] <- ( x[i] - i )**2
for (j in 1:m) v.m[j] <- ( y[j] - j )**2
# Test statistic
reps_0 <- ( (n * sum(v.n) + m * sum(v.m)) / (m * n * N) ) -
(4 * m * n - 1) / (6 * N)
for (k in 1:r) { # Permautation samples
w <- sample(N, size = n, replace = FALSE)
x1 <- sort( z[ w] )
y1 <- sort( z[-w] )
for (i in 1:n) { v1.n[i] <- ( x1[i] - i )**2 }
for (j in 1:m) { v1.m[j] <- ( y1[j] - j )**2 }
reps[k] <- ( (n * sum(v1.n) + m * sum(v1.m)) / (m * n * N) ) -
(4 * m * n - 1) / (6 * N)
}
p <- mean( c(reps_0, reps) >= reps_0 )
return(
histogram(c(reps_0, reps) # Histogram
, type = "density"
, col = "#0080ff"
, xlab = "Replicates of Cramér-Von Mises (CVM) statistic"
, ylab = list(rot = 0)
, main = paste0(
"Data: ", data, "\n"
, "Permutation distribution of CVM statistic")
, sub = list(substitute(paste(hat(p), " = ",pvalue)
, list(pvalue = p))
, col = 2)
, panel = function(...){
panel.histogram(...)
panel.abline(v = reps_0, col = 2, lwd = 2)
})
)
}
## Data: Example 8.1
x <- with(chickwts, sort(as.vector(weight[feed == "soybean"])))
y <- with(chickwts, sort(as.vector(weight[feed == "linseed"])))
cvm_8.1 <- cvm(x, y, "Example 8.1")
## Data: Example 8.2
x <- with(chickwts, sort(as.vector(weight[feed == "sunflower"])))
y <- with(chickwts, sort(as.vector(weight[feed == "linseed"])))
cvm_8.2 <- cvm(x, y, "Example 8.2")
## Results
print(cvm_8.1, position = c(0, 0, .5, 1), more = TRUE)
print(cvm_8.2, position = c(.5, 0, 1, 1))
Exercise 8.2
Implement the bivariate Spearman rank correlation test for independence [255] as a permutation test. The Spearman rank correlation test statistic can be obtained from function cor
with method = "spearman"
. Compare the achieved significance level of the permutation test with the p-value reported by cor.test
on the same samples.
Solution:
data("iris")
## Dataset
z <- as.matrix(iris[1:50, 1:4])
x <- (z[ , 1:2])
y <- (z[ , 3:4])
cor(x,y, method = "spearman")
Petal.Length Petal.Width
Sepal.Length 0.2788849 0.2994989
Sepal.Width 0.1799110 0.2865359
cor.test(x, y, method = "spearman", exact = FALSE)$estimate
rho
0.8292811
## Packages
library(boot)
## function: bivariate Spearman rank correlation test for independence
spear.cor <- function(z) {
rho.est <- function(z, i) { # Test statistic
x <- z[ , 1:2]
y <- z[i, 3:4]
return(cor.test(x, y, method = "spearman", exact = FALSE)$estimate)
}
perm <- boot(data = z # 10000 permutation samples
, statistic = rho.est
, sim = "permutation"
, R = 10000)
p <- with(perm, mean( c(t, t0) >= t0 )) # p-value
return(
histogram(with(perm, c(t, t0)) # Histogram
, type = "density"
, col = "#0080ff"
, xlab = "Replicates correlation"
, ylab = list(rot = 0)
, main = paste(
"Permutation distribution of the bivariate Spearman",
"rank correlation test")
, sub = list(substitute(paste(hat(p), " = ",pvalue)
, list(pvalue = p))
, col = 2)
, panel = function(...){
panel.histogram(...)
panel.abline(v = perm$t0, col = 2, lwd = 2)
})
)
}
spear.cor(z)
Exercise 8.3
The Count 5 test for equal variances in Section 6.4 is based on the maximum number of extreme points. Example 6.15 shows that the Count 5 criterion is not applicable for unequal sample sizes. Implement a permutation test for equal variance based on the maximum number of extreme points that applies when sample sizes are not necessarily equal.
Solution:
## Dataset
n_1 <- 20 # Defining the sample sizes
n_2 <- 30
x <- rnorm(n_1, 0, 5)
y <- rnorm(n_2, 0, 5)
## function: permutation test for equal variance based
## on the maximum number of extreme points
count5 <- function(x, y) {
count5test <- function(x, y) { # Test statistic
X <- x - mean(x)
Y <- y - mean(y)
outx <- sum(X > max(Y)) + sum(X < min(Y))
outy <- sum(Y > max(X)) + sum(Y < min(X))
# Return 1 (reject) or 0 (do not reject H_{0})
return( as.integer( max(c(outx, outy) ) > 5) )
}
r <- 10000 # Permutation samples
z <- c(x, y)
n <- length(z)
reps <- vector("numeric", r)
t0 <- count5test(x, y)
for (i in 1:r){ # Permutation test
k = sample(n, size = length(x), replace = FALSE)
reps[i] = count5test(z[k], z[-k])
}
p <-
ifelse(t0 == 0, mean( c(t0, reps) > t0 ), mean( c(t0, reps) >= t0 ))
return(
histogram(c(t0, reps) # Histogram
, type = "density"
, col = "#0080ff"
, xlab = "Replicates of the Count 5 test"
, ylab = list(rot = 0)
, main = "Permutation distribution of the Count 5 test"
, sub = list(substitute(paste(hat(p), " = ",pvalue)
, list(pvalue = p))
, col = 2)
, panel = function(...){
panel.histogram(...)
panel.abline(v = t0, col = 2, lwd = 2)
})
)
}
count5(x, y)
Exercise 8.4
Complete the steps to implement a \(r^{th}\) -nearest neighbors test for equal distributions. Write a function to compute the test statistic. The function should take the data matrix as its first argument, and an index vector as the second argument. The number of nearest neighbors r should follow the index argument.
Solution:
data("chickwts")
## Dataset
x <- with(chickwts, as.vector(weight[feed == "sunflower"]))
y <- with(chickwts, as.vector(weight[feed == "linseed"]))
z <- cbind( c(x, y), rep(0, length(c(x, y))) )
## Packages
library(boot)
library(FNN)
library(latticeExtra)
## function: r^{th} -nearest neighbors test for equal distributions
my.knn <- function(z, nn) {
t_n.i <- function(z, nn, i) {
n_g <- nrow(z) / 2 # length x and length y
n <- nrow(z)
z <- z[i, ]
z <- cbind(z, rep(0, n))
knn <- get.knn(z, k = nn) # Obtaining the k nearest neighbors
n_1 <- knn$nn.index[1:n_g, ] # Dividing x
n_2 <- knn$nn.index[(n_g + 1):n, ] # and y
i_1 <- sum(n_1 < n_g + .5) # Obtaining the sum of
i_2 <- sum(n_2 > n_g + .5) # the indicator functions
return( (i_1 + i_2) / (nn * n) ) # Test statistic
}
perm <- boot(data = z # 10000 permutation samples
, statistic = t_n.i
, sim = "permutation"
, R = 10000
, nn = 3)
p <- with(perm, mean( c(t, t0) >= t0 )) # p-value
return(
histogram(with(perm, c(t, t0)) # Histogram
, type = "density"
, col = "#0080ff"
, xlim = c(-.1, 1.1)
, xlab = paste0("Replicates of T(n, ", nn, ") statistic")
, ylab = list(rot = 0)
, main = paste0("Permutation distribution of T(n, "
, nn, ") statistic")
, sub = list(substitute(paste(hat(p), " = ", pvalue)
, list(pvalue = p))
, col = 2)
, panel = function(...){
panel.histogram(...)
panel.abline(v = p, col = 2, lwd = 2)
})
)
}
my.knn(z, 3) # k (or r) nearest neighbors = 3