Exercises - Chapter 8: Permutation Tests


Students:

  • Henrique Aparecido Laureano
  • Elainy Marciano Batista
  • Heidi Mara do Rosário Souza
  • Joubert Miranda Guedes
  • Nicolas Romano
  • Pedro Luciano de Oliveira Gomes
  • Ricardo de Faria Souza
  • Telma Tompson
  • Thais Castelo Branco Monho

Exercises from chapter 8 of the book:

Maria L. Rizzo, 2008. Statistical Computing with R, Chapman & Hall/CRC, ISBN 1-58488-545-9, 393 pages.


Exercise 8.1


Implement the two-sample Cramér-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2.


Solution:

data("chickwts")

## Packages
library(latticeExtra)

## function: two-sample Cramér-von Mises test for equal distributions
cvm <- function(x, y, data){
  r <- 10000 # Permutation samples
  reps <- vector("numeric", r)
  n <- length(x)
  m <- length(y)
  v.n <- vector("numeric", n) # Replication vectors
  v1.n <- vector("numeric", n)
  v.m <- vector("numeric", m)
  v1.m <- vector("numeric", m)
  z <- c(x, y)
  N <- length(z)
  for (i in 1:n) v.n[i] <- ( x[i] - i )**2
  for (j in 1:m) v.m[j] <- ( y[j] - j )**2
  # Test statistic
  reps_0 <- ( (n * sum(v.n) + m * sum(v.m)) / (m * n * N) ) -
    (4 * m * n - 1) / (6 * N)
  for (k in 1:r) { # Permautation samples
    w <- sample(N, size = n, replace = FALSE)
    x1 <- sort( z[ w] )
    y1 <- sort( z[-w] )
    for (i in 1:n) { v1.n[i] <- ( x1[i] - i )**2 }
    for (j in 1:m) { v1.m[j] <- ( y1[j] - j )**2 }
    reps[k] <- ( (n * sum(v1.n) + m * sum(v1.m)) / (m * n * N) ) -
      (4 * m * n - 1) / (6 * N)
  }
  p <- mean( c(reps_0, reps) >= reps_0 )
  return(
    histogram(c(reps_0, reps) # Histogram
              , type = "density"
              , col = "#0080ff"
              , xlab = "Replicates of Cramér-Von Mises (CVM) statistic"
              , ylab = list(rot = 0)
              , main = paste0(
                "Data: ", data, "\n"
                , "Permutation distribution of CVM statistic")
              , sub = list(substitute(paste(hat(p), " = ",pvalue)
                                      , list(pvalue = p))
                           , col = 2)
              , panel = function(...){
                panel.histogram(...)
                panel.abline(v = reps_0, col = 2, lwd = 2)
              })
  )
}
## Data: Example 8.1
x <- with(chickwts, sort(as.vector(weight[feed == "soybean"])))
y <- with(chickwts, sort(as.vector(weight[feed == "linseed"])))

cvm_8.1 <- cvm(x, y, "Example 8.1")

## Data: Example 8.2
x <- with(chickwts, sort(as.vector(weight[feed == "sunflower"])))
y <- with(chickwts, sort(as.vector(weight[feed == "linseed"])))

cvm_8.2 <- cvm(x, y, "Example 8.2")

## Results
print(cvm_8.1, position = c(0, 0, .5, 1), more = TRUE)
print(cvm_8.2, position = c(.5, 0, 1, 1))


Exercise 8.2


Implement the bivariate Spearman rank correlation test for independence [255] as a permutation test. The Spearman rank correlation test statistic can be obtained from function cor with method = "spearman". Compare the achieved significance level of the permutation test with the p-value reported by cor.test on the same samples.


Solution:

data("iris")

## Dataset
z <- as.matrix(iris[1:50, 1:4])
x <- (z[ , 1:2])
y <- (z[ , 3:4])

cor(x,y, method = "spearman")
             Petal.Length Petal.Width
Sepal.Length    0.2788849   0.2994989
Sepal.Width     0.1799110   0.2865359
cor.test(x, y, method = "spearman", exact = FALSE)$estimate
      rho 
0.8292811 
## Packages
library(boot)

## function: bivariate Spearman rank correlation test for independence
spear.cor <- function(z) {
  rho.est <- function(z, i) { # Test statistic
    x <- z[ , 1:2] 
    y <- z[i, 3:4]
    return(cor.test(x, y, method = "spearman", exact = FALSE)$estimate)
  }
  perm <- boot(data = z # 10000 permutation samples
               , statistic = rho.est
               , sim = "permutation"
               , R = 10000)
  p <- with(perm, mean( c(t, t0) >= t0 )) # p-value
  return(
    histogram(with(perm, c(t, t0)) # Histogram
              , type = "density"
              , col = "#0080ff"
              , xlab = "Replicates correlation"
              , ylab = list(rot = 0)
              , main = paste(
                "Permutation distribution of the bivariate Spearman",
                "rank correlation test")
              , sub = list(substitute(paste(hat(p), " = ",pvalue)
                                      , list(pvalue = p))
                           , col = 2)
              , panel = function(...){
                panel.histogram(...)
                panel.abline(v = perm$t0, col = 2, lwd = 2)
              })
  )
}
spear.cor(z)


Exercise 8.3


The Count 5 test for equal variances in Section 6.4 is based on the maximum number of extreme points. Example 6.15 shows that the Count 5 criterion is not applicable for unequal sample sizes. Implement a permutation test for equal variance based on the maximum number of extreme points that applies when sample sizes are not necessarily equal.


Solution:

## Dataset
n_1 <- 20 # Defining the sample sizes
n_2 <- 30
x <- rnorm(n_1, 0, 5)
y <- rnorm(n_2, 0, 5)

## function: permutation test for equal variance based
##           on the maximum number of extreme points
count5 <- function(x, y) {
  count5test <- function(x, y) { # Test statistic
    X <- x - mean(x)
    Y <- y - mean(y)
    outx <- sum(X > max(Y)) + sum(X < min(Y))
    outy <- sum(Y > max(X)) + sum(Y < min(X))
    # Return 1 (reject) or 0 (do not reject H_{0})
    return( as.integer( max(c(outx, outy) ) > 5) )
  }
  r <- 10000 # Permutation samples
  z <- c(x, y)
  n <- length(z)
  reps <- vector("numeric", r)
  t0 <- count5test(x, y)
  for (i in 1:r){ # Permutation test
    k = sample(n, size = length(x), replace = FALSE)
    reps[i] = count5test(z[k], z[-k])
  }
  p <-
    ifelse(t0 == 0, mean( c(t0, reps) > t0 ), mean( c(t0, reps) >= t0 ))
  return(
    histogram(c(t0, reps) # Histogram
              , type = "density"
              , col = "#0080ff"
              , xlab = "Replicates of the Count 5 test"
              , ylab = list(rot = 0)
              , main = "Permutation distribution of the Count 5 test"
              , sub = list(substitute(paste(hat(p), " = ",pvalue)
                                      , list(pvalue = p))
                           , col = 2)
              , panel = function(...){
                panel.histogram(...)
                panel.abline(v = t0, col = 2, lwd = 2)
              })
  )
}
count5(x, y)


Exercise 8.4


Complete the steps to implement a \(r^{th}\) -nearest neighbors test for equal distributions. Write a function to compute the test statistic. The function should take the data matrix as its first argument, and an index vector as the second argument. The number of nearest neighbors r should follow the index argument.


Solution:

data("chickwts")

## Dataset
x <- with(chickwts, as.vector(weight[feed == "sunflower"]))
y <- with(chickwts, as.vector(weight[feed == "linseed"]))
z <- cbind( c(x, y), rep(0, length(c(x, y))) )

## Packages
library(boot)
library(FNN)
library(latticeExtra)

## function: r^{th} -nearest neighbors test for equal distributions
my.knn <- function(z, nn) {
  t_n.i <- function(z, nn, i) {
    n_g <- nrow(z) / 2 # length x and length y
    n <- nrow(z)
    z <- z[i, ]
    z <- cbind(z, rep(0, n))
    knn <- get.knn(z, k = nn) # Obtaining the k nearest neighbors
    n_1 <- knn$nn.index[1:n_g, ]       # Dividing x
    n_2 <- knn$nn.index[(n_g + 1):n, ] # and y
    i_1 <- sum(n_1 < n_g + .5) # Obtaining the sum of
    i_2 <- sum(n_2 > n_g + .5) # the indicator functions
    return( (i_1 + i_2) / (nn * n) ) # Test statistic
  }
  perm <- boot(data = z # 10000 permutation samples
               , statistic = t_n.i
               , sim = "permutation"
               , R = 10000
               , nn = 3)
  p <- with(perm, mean( c(t, t0) >= t0 )) # p-value
  return(
    histogram(with(perm, c(t, t0)) # Histogram
              , type = "density"
              , col = "#0080ff"
              , xlim = c(-.1, 1.1)
              , xlab = paste0("Replicates of T(n, ", nn, ") statistic")
              , ylab = list(rot = 0)
              , main = paste0("Permutation distribution of T(n, "
                              , nn, ") statistic")
              , sub = list(substitute(paste(hat(p), " = ", pvalue)
                                      , list(pvalue = p))
                           , col = 2)
              , panel = function(...){
                panel.histogram(...)
                panel.abline(v = p, col = 2, lwd = 2)
              })
  )
}
my.knn(z, 3) # k (or r) nearest neighbors = 3