Chapter_07.R · 芝士酷

## -------------------------------- 似然比检验 --------------------------------
rm(list = ls(all = TRUE)) # 清空工作环境
graphics.off() # 清空绘图区域

## 检验问题 1
library(mclust)
data(banknote)
str(banknote)
head(banknote)
banknote_genuine = subset(banknote, Status == 'genuine')[, 2:7]
banknote_counterfeit = subset(banknote, Status == 'counterfeit')[, 2:7]
mu_0 <- sapply(banknote_genuine, mean); mu_0
x_bar <- sapply(banknote_counterfeit, mean); x_bar

n <- dim(banknote_counterfeit)[1]
Sigma <- (n-1) * cov(banknote_counterfeit) / n; Sigma

mu_0 <- as.matrix(mu_0)
x_bar <- as.matrix(x_bar)
n * t(x_bar - mu_0) %*% solve(Sigma) %*% (x_bar - mu_0)

#### Test Problem 2
f <- function(x) log(1 + x)
curve(f, 0, 10, xlab='', ylab='', axes=F, lwd=2)
abline(h=0, v=0)
lines(c(5, 5), c(0, f(5)), lty=2, col='red')
lines(c(0, 5), c(f(5), f(5)), lty=2, col='red')

p <- dim(Sigma)[1]
((n-p) / p) * t(x_bar - mu_0) %*% solve(Sigma) %*% (x_bar - mu_0)
qf(0.05, p, n-p, lower.tail = FALSE)

#### The 95% simultaneous confidence intervals
x_lower <- array(0, dim = p)
x_upper <- array(0, dim = p)
for (i in 1:p){
  x_lower[i] <- x_bar[i] - sqrt(((p)/(n-p)) * 
                                  qf(0.05, p, n-p, lower.tail = FALSE) * Sigma[i, i])
  x_upper[i] <- x_bar[i] + sqrt(((p)/(n-p)) * 
                                  qf(0.05, p, n-p, lower.tail = FALSE) * Sigma[i, i])
}
mu_simul_int <- data.frame(x_lower = x_lower, x_upper = x_upper)
mu_simul_int

a <- matrix(c(0, 0, 0, 1, -1, 0), nrow=6)
t(a) %*% x_bar - sqrt(((p)/(n-p))  * qf(0.05, p, n-p, lower.tail = FALSE) * 
                        t(a) %*% Sigma %*% a)
t(a) %*% x_bar + sqrt(((p)/(n-p))  * qf(0.05, p, n-p, lower.tail = FALSE) * 
                        t(a) %*% Sigma %*% a)

library(car)
a <- c(2, 2)
b <- matrix(c(1, -0.5, -0.5, 1), nrow = 2)
plot(c(0, 4), c(0, 4), type = 'n', xlab = '', ylab = '', axes = F, asp = 1)
ellipse(a, b, radius = 1.5, center.pch = 16, center.cex = 1, fill = TRUE)
abline(h = 0, v = 0)
lines(c(2, 3.3), c(2, 0.7), lty = 2)
lines(c(2, 2.75), c(2, 2.75), lty = 2)

a <- c(2, 2)
b <- matrix(c(1, -0.5, -0.5, 1), nrow = 2)
plot(c(0, 4), c(0, 4), type = 'n', xlab = '', ylab = '', axes = F, asp = 1)
ellipse(a, b, radius = 1.5, center.pch = 16, center.cex=1, fill = TRUE)
abline(h = 0, v = 0)
lines(c(-0.05, 2.55, 4.05, 1.45, -0.05), c(2.55, -0.05, 1.45, 4.05, 2.55), 
      col = 'red', lwd = 2)
points(1.45, 3.7, pch = 16, cex = 1.5, col = 'cyan4')
lines(c(0.15, 1.65), c(2.4, 3.9), lty = 4, lwd = 1.5, col = 'cyan4')
lines(c(3.85, 1.25), c(1.25, 3.9), lty = 4, lwd = 1.5, col = 'cyan4')

##### US companies data
# clear variables and close windows
rm(list = ls(all = TRUE))
graphics.off()

# energy sector data
x = rbind(c(13621, 4848, 4572, 485, 898.9, 23.4), c(1117, 1038, 478, 59.7, 91.7, 3.8), 
          c(1633, 701, 679, 74.3, 135.9, 2.8), c(5651, 1254, 2002, 310.7, 407.9, 6.2), 
          c(5835, 4053, 1601, -93.8, 173.8, 10.8), c(3494, 1653, 1442, 160.9, 320.3, 6.4), 
          c(1654, 451, 779, 84.8, 130.4, 1.6), c(1679, 1354, 687, 93.8, 154.6, 4.6), 
          c(1257, 355, 181, 167.5, 304, 0.6), c(1743, 597, 717, 121.6, 172.4, 3.5), 
          c(1440, 1617, 639, 81.7, 126.4, 3.5), c(14045, 15636, 2754, 418, 1462, 27.3), 
          c(3010, 749, 1120, 146.3, 209.2, 3.4), c(3086, 1739, 1507, 202.7, 335.2, 4.9), 
          c(1995, 2662, 341, 34.7, 100.7, 2.3))
x
# unbiased covariance matrix for energy sector
(cov(x[, 1:2]))
# MLE/empirical variance matrix for energy sector
(S = cov(x[, 1:2]) * 14/15)

# manufacturing sector data
y = rbind(c(1093, 1679, 1070, 100.9, 164.5, 20.8), c(1128, 1516, 430, -47, 26.7, 13.2), 
          c(1804, 2564, 483, 70.5, 164.9, 26.6), c(4662, 4781, 2988, 28.7, 371.5, 66.2), 
          c(6307, 8199, 598, -771.5, -524.3, 57.5), c(2366, 3305, 1117, 131.2, 256.5, 25.2), 
          c(4084, 4346, 3023, 302.7, 521.7, 37.5), c(10348, 5721, 1915, 223.6, 322.5, 49.5), 
          c(752, 2149, 101, 11.1, 15.2, 2.6), c(10528, 14992, 5377, 312.7, 710.7, 184.8))
y
# unbiased variance matrix for manufacturing sector
(cov(y[, 1:2]))
# empirical/MLE variance matrix for manufacturing sector
(sigma = cov(y[, 1:2]) * 9/10)
# testing statistic
(test = 15 * sum(diag(solve(sigma) %*% S)) - 15 * log10(det(solve(sigma) %*% S)) - 15 * 2)
# p-value
(p = pchisq(test, 3, lower.tail = FALSE))

#### Pullovers
# clear variables and close windows
rm(list = ls(all = TRUE))
graphics.off()

x <- c(230, 125, 200, 109, 181, 99, 55, 107, 165, 97, 105, 98, 
       150, 115, 85, 71, 97, 120, 0, 82,
       192, 100, 150, 103, 181, 80, 85, 111, 189, 90, 120, 93, 
       172, 95, 110, 86, 170, 125, 130, 78)
pullover <- matrix(x, ncol = 4, byrow = TRUE)
pullover <- as.data.frame(pullover)
colnames(pullover) <- c("X1", "X2", "X3", "X4")
pullover
X1_2.lm <- lm(X1 ~ X2, data = pullover)
summary(X1_2.lm)
plot(pullover[, 2:1], xlab='X2 (Price)', ylab='X1 (Sales)', axes=FALSE, pch=16, ylim=c(90, 250), 
     xlim=c(75, 130), cex=2)
axis(1)
axis(2)
abline(X1_2.lm, col='red', lwd=2)

# LR test statistic
y <- as.matrix(pullover$X1)
X <- matrix(c(rep(1, 10), pullover$X2), ncol=2, byrow=FALSE)
beta_0 <- matrix(c(211, 0), nrow=2)
beta_hat <- as.matrix(X1_2.lm$coefficients)
n <- 10
(test.stat <- n * log(t(y - X %*% beta_0) %*% (y - X %*% beta_0) / 
                        t(y - X %*% beta_hat) %*% (y - X %*% beta_hat)))

# p-value
pchisq(test.stat, 2, lower.tail = FALSE)

# F test statistic
(ftest.stat <- ((n -2)/2) * (t(y - X %*% beta_0) %*% (y - X %*% beta_0) / 
                               t(y - X %*% beta_hat) %*% (y - X %*% beta_hat) - 1))

# p-value
pf(ftest.stat, 2, n-2, lower.tail = FALSE)

#### Test Problem 6
# clear variables and close windows
rm(list = ls(all = TRUE))
library(mclust)
data(banknote)
banknote_counterfeit = subset(banknote, Status == 'counterfeit')[, 2:7]
x_bar <- as.matrix(sapply(banknote_counterfeit, mean)); x_bar
n <- dim(banknote_counterfeit)[1]
S_f <- (n-1) * cov(banknote_counterfeit) / n; S_f
A <- matrix(c(0, 0, 0, 1, -1, 0), nrow = 1, byrow = TRUE); A
# Test statistic
test.stat <- (n-1) * t(A %*% x_bar) %*% solve(A %*% S_f %*% t(A)) %*% (A %*% x_bar)
test.stat
# p-value
pf(test.stat, 1, n-1, lower.tail = FALSE)

#### Example: Bock
rm(list = ls(all = TRUE))
n <- 40
p <- 4
x_bar <- matrix(c(1.086, 2.544, 2.851, 3.420), nrow = 4)
x <- c(2.902, 2.438, 2.963, 2.183, 2.438, 3.049, 2.775, 2.319,
       2.963, 2.775, 4.281, 2.939, 2.183, 2.319, 2.939, 3.162)
S <- matrix(x, nrow = 4, byrow = TRUE)
C <- matrix(c(1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1), nrow = 3, byrow = TRUE)

# F statistic
(F_obs <- ((n-p-1) / (p-1)) * t(x_bar) %*% t(C) %*% solve(C %*% S %*% t(C)) %*% C %*% x_bar)
(p_value <- pf(F_obs, p-1, n-p+1, lower.tail = FALSE))

# Simultaneous 95% confidence intervals
b1 <- matrix(c(1, -1, 0, 0), nrow = 4)
(t(b1) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b1) %*% S %*% b1))
(t(b1) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b1) %*% S %*% b1))

b2 <- matrix(c(0, 1, -1, 0), nrow = 4)
(t(b2) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b2) %*% S %*% b2))
(t(b2) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b2) %*% S %*% b2))

b3 <- matrix(c(0, 0, 1, -1), nrow = 4)
(t(b3) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b3) %*% S %*% b3))
(t(b3) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b3) %*% S %*% b3))

b4 <- matrix(c(1, -1/3, -1/3, -1/3), nrow = 4)
(t(b4) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b4) %*% S %*% b4))
(t(b4) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b4) %*% S %*% b4))

b5 <- matrix(c(1/3, 1/3, 1/3, -1), nrow = 4)
(t(b5) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b5) %*% S %*% b5))
(t(b5) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b5) %*% S %*% b5))

b6 <- matrix(c(0, 1, 0, -1), nrow = 4)
(t(b6) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b6) %*% S %*% b6))
(t(b6) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * 
                          t(b6) %*% S %*% b6))

#### Test Problem 7
# Classic pullover data, regression of sales X1 on X2, X3, and X4
x <- c(230, 125, 200, 109, 181, 99, 55, 107, 165, 97, 105, 98, 
       150, 115, 85, 71, 97, 120, 0, 82,
       192, 100, 150, 103, 181, 80, 85, 111, 189, 90, 120, 93, 
       172, 95, 110, 86, 170, 125, 130, 78)
pullover <- matrix(x, ncol = 4, byrow = TRUE)
pullover <- as.data.frame(pullover)
colnames(pullover) <- c("X1", "X2", "X3", "X4")
pullover
X1_234 <- lm(X1 ~ X2 + X3 + X4, data = pullover)
summary(X1_234)

# Likelihood Ratio statistic
n <- 10
A <- matrix(c(0, 1, 1/2, 0), nrow = 1)
X <- pullover[, 2:4]
X <- as.matrix(cbind(X_1 = rep(1, 10), X))
beta_hat <- as.matrix(coefficients(X1_234))
beta_tilde <- beta_hat - solve(t(X) %*% X) %*% t(A) %*% 
  solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% A %*% beta_hat
LR <- n * log(sum((pullover$X1 - X %*% beta_tilde)^2) / 
                sum((pullover$X1 - X %*% beta_hat)^2))
LR
# p_value
q <- 4
p_value <- pchisq(LR, q, lower.tail = FALSE)
p_value

# F statistic
p <- 3
F <- ((n-p)/q) * (t(A %*% beta_hat) %*% solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% 
                    A %*% beta_hat) / (sum((pullover$X1 - X %*% beta_hat)^2))
F
# p_value
p_value <- pf(F, q, n-p, lower.tail = FALSE)
p_value

#### Test Problem 8 ##### US companies data ####
# clear variables and close windows
rm(list = ls(all = TRUE))
graphics.off()

# energy sector data
x = rbind(c(13621, 4848, 4572, 485, 898.9, 23.4), c(1117, 1038, 478, 59.7, 91.7, 3.8), 
          c(1633, 701, 679, 74.3, 135.9, 2.8), c(5651, 1254, 2002, 310.7, 407.9, 6.2), 
          c(5835, 4053, 1601, -93.8, 173.8, 10.8), c(3494, 1653, 1442, 160.9, 320.3, 6.4), 
          c(1654, 451, 779, 84.8, 130.4, 1.6), c(1679, 1354, 687, 93.8, 154.6, 4.6), 
          c(1257, 355, 181, 167.5, 304, 0.6), c(1743, 597, 717, 121.6, 172.4, 3.5), 
          c(1440, 1617, 639, 81.7, 126.4, 3.5), c(14045, 15636, 2754, 418, 1462, 27.3), 
          c(3010, 749, 1120, 146.3, 209.2, 3.4), c(3086, 1739, 1507, 202.7, 335.2, 4.9), 
          c(1995, 2662, 341, 34.7, 100.7, 2.3))
n1 <- dim(x)[1]; p <- 2
# mean vector for energy sector
(x_mean <- as.matrix(apply(x[, 1:2], 2, mean)))
# MLE/empirical variance matrix for energy sector
(S_1 = cov(x[, 1:2]) * (n1-1)/(n1))

# manufacturing sector data
y = rbind(c(1093, 1679, 1070, 100.9, 164.5, 20.8), c(1128, 1516, 430, -47, 26.7, 13.2), 
          c(1804, 2564, 483, 70.5, 164.9, 26.6), c(4662, 4781, 2988, 28.7, 371.5, 66.2), 
          c(6307, 8199, 598, -771.5, -524.3, 57.5), c(2366, 3305, 1117, 131.2, 256.5, 25.2), 
          c(4084, 4346, 3023, 302.7, 521.7, 37.5), c(10348, 5721, 1915, 223.6, 322.5, 49.5), 
          c(752, 2149, 101, 11.1, 15.2, 2.6), c(10528, 14992, 5377, 312.7, 710.7, 184.8))
n2 <- dim(y)[1]
# mean vector for manufacturing sector
(y_mean <- as.matrix(apply(y[, 1:2], 2, mean)))
# MLE/empirical variance matrix for manufacturing sector
(S_2 = cov(y[, 1:2]) * (n2-1)/(n2))

# Weighted S
(S <- (n1 * S_1 + n2 * S_2) / (n1 + n2))

# F statistic
(F <- n1 * n2 * (n1 + n2 - p - 1) / (p * (n1 + n2)^2) * t(x_mean - y_mean) %*% 
    solve(S) %*% (x_mean - y_mean))

# p-value
(pf(F, p, n1+n2-p-1, lower.tail = FALSE))

# The 95% simultaneous confidence intervals for the differences
a <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * 
            qf(0.05, p, n1+n2-p-1) * S[1, 1])
b <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * 
            qf(0.05, p, n1+n2-p-1) * S[2, 2])
x_mean - y_mean - matrix(c(a, b), nrow=2)
x_mean - y_mean + matrix(c(a, b), nrow=2)

#### Test Problem 8 ##### Simulated data ####
library(MASS)
(mu_1 = c(8, 6, 10, 10))
(mu_2 = c(6, 6, 10, 13))
(Sigma_1 = matrix(c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), 
                  nrow = 4, byrow = TRUE))
(n1 = 30)
(n2 = 20)
(p = 4)
(X_1 <- mvrnorm(n1 , mu_1, Sigma_1))
(X_2 <- mvrnorm(n2 , mu_2, Sigma_1))

# sample mean vectors and empirical covariance matrices
mean_X1 <- apply(X_1, 2, mean)
round(mean_X1, digits = 3)
mean_X2 <- apply(X_2, 2, mean)
round(mean_X2, digits = 3)
S_1 = cov(X_1) * (n1-1)/(n1)
round(S_1, digits = 3)
S_2 = cov(X_2) * (n2-1)/(n2)
round(S_2, digits = 3)

# F statistic for testing H_0: mu_1 = mu_2
S <- (n1 * S_1 + n2 * S_2) / (n1 + n2)
F <- n1 * n2 * (n1 + n2 - p - 1) / (p * (n1 + n2)^2) * 
  t(mean_X1 - mean_X2) %*% solve(S) %*% (mean_X1 - mean_X2)
F
# p-value
pf(F, p, n1+n2-p-1, lower.tail = FALSE)

# The 95% simultaneous confidence intervals for the differences
a1 <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * 
             qf(0.05, p, n1+n2-p-1) * S[1, 1])
a2 <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * 
             qf(0.05, p, n1+n2-p-1) * S[2, 2])
a3 <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * 
             qf(0.05, p, n1+n2-p-1) * S[3, 3])
a4 <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * 
             qf(0.05, p, n1+n2-p-1) * S[4, 4])
mean_X1 - mean_X2 - matrix(c(a1, a2, a3, a4), nrow=4)
mean_X1 - mean_X2 + matrix(c(a1, a2, a3, a4), nrow=4)

# Simulate with different Sigma
Sigma_2 = matrix(c(16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16), 
                 nrow = 4, byrow = TRUE)
Y_1 <- mvrnorm(n1 , mu_1, Sigma_2); Y_1
Y_2 <- mvrnorm(n2 , mu_2, Sigma_2); Y_2

# sample mean vectors and empirical covariance matrices
mean_Y1 <- apply(Y_1, 2, mean)
round(mean_Y1, digits = 3)
mean_Y2 <- apply(Y_2, 2, mean)
round(mean_Y2, digits = 3)
S_1 = cov(Y_1) * (n1-1)/(n1)
round(S_1, digits = 3)
S_2 = cov(Y_2) * (n2-1)/(n2)
round(S_2, digits = 3)

# F statistic for testing H_0: mu_1 = mu_2
S <- (n1 * S_1 + n2 * S_2) / (n1 + n2)
F <- n1 * n2 * (n1 + n2 - p - 1) / (p * (n1 + n2)^2) * t(mean_Y1 - mean_Y2) %*% 
  solve(S) %*% (mean_Y1 - mean_Y2)
F
# p-value
pf(F, p, n1+n2-p-1, lower.tail = FALSE)

#### Compare the vectors of means of the forged and the genuine banknotes.
rm(list = ls(all = TRUE))
library(mclust)
data(banknote)
banknote_genuine = subset(banknote, Status == 'genuine')[, 2:7]
banknote_counterfeit = subset(banknote, Status == 'counterfeit')[, 2:7]
n_g <- dim(banknote_genuine)[1]
p <- dim(banknote_genuine)[2]
n_f <- dim(banknote_counterfeit)[1]

# sample statistics
mean_g <- apply(banknote_genuine, 2, mean)
round(mean_g, digits = 2)
mean_f <- apply(banknote_counterfeit, 2, mean)
round(mean_f, digits = 2)
Sigma_g <- (n_g - 1) * cov(banknote_genuine) / n_g
round(Sigma_g, digits = 4)
Sigma_f <- (n_f - 1) * cov(banknote_counterfeit) / n_f
round(Sigma_f, digits = 4)

# F statistic for testing H_0: mu_1 = mu_2
S <- (n_g * Sigma_g + n_f * Sigma_f) / (n_g + n_f)
F <- n_g * n_f * (n_g + n_f - p - 1) / (p * (n_g + n_f)^2) * t(mean_g - mean_f) %*% 
  solve(S) %*% (mean_g - mean_f)
F

# p-value
pf(F, p, n_g + n_f - p - 1, lower.tail = FALSE)

# The 95% simultaneous confidence intervals for the differences
a1 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * 
             qf(0.05, p, n_g + n_f - p - 1) * S[1, 1])
a2 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * 
             qf(0.05, p, n_g + n_f - p - 1) * S[2, 2])
a3 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * 
             qf(0.05, p, n_g + n_f - p - 1) * S[3, 3])
a4 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * 
             qf(0.05, p, n_g + n_f - p - 1) * S[4, 4])
a5 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * 
             qf(0.05, p, n_g + n_f - p - 1) * S[5, 5])
a6 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * 
             qf(0.05, p, n_g + n_f - p - 1) * S[6, 6])
mean_g - mean_f - matrix(c(a1, a2, a3, a4, a5, a6), nrow=6)
mean_g - mean_f + matrix(c(a1, a2, a3, a4, a5, a6), nrow=6)

#### Test Problem 9
# likelihood ratio statistic
LR_statistic <- log(det(S)) * (n_g + n_f) - (log(det(Sigma_g)) * n_g + 
                                               log(det(Sigma_f)) * n_f)
LR_statistic

# p-value
k <- 2
m <- 0.5 * (k-1) * p * (p+1)
pchisq(LR_statistic, m, lower.tail = FALSE)

#### Test Problem 10
# test statistic
mean_g <- as.matrix(mean_g)
mean_f <- as.matrix(mean_f)
TS <- t(mean_g - mean_f) %*% solve((1/n_g) * Sigma_g + (1/n_f) * 
                                     Sigma_f) %*% (mean_g - mean_f)
TS

# p-value
pchisq(TS, p, lower.tail = FALSE)

# The 95% simultaneous confidence intervals
a <- array(0, dim = p)
for (j in 1:p) {
  a[j] <- sqrt(qchisq(0.05, p, lower.tail = FALSE) * 
                 (Sigma_g[j, j] / n_g + Sigma_f[j, j] / n_f))
}
a <- as.matrix(a)
Lower <- mean_g - mean_f - a
Upper <- mean_g - mean_f + a
SCI <- data.frame(Lower = Lower, Upper = Upper)
round(SCI, digits = 4)

################### Profile Analysis ####################
## Fig 7.1
rm(list = ls(all = TRUE)) # clear variables
graphics.off() # close plot windows
x <- c(0, 1, 2, 3, 4, 5, 6)
y1 <- c(4.5, 4.0, 3.2, 2.8, 3.0, 3.8, 4.4)
y2 <- c(3.0, 2.6, 2.4, 2.3, 2.5, 3.0, 3.1)
plot(c(0, 5), c(0, 5), type = 'n', axes = FALSE, xlab = 'Treatment', ylab = 'Mean', 
     main = 'Population Profiles', asp = 1)
abline(h = 0, v = 0)
lines(x, y1, lty = 1, lwd = 2, col = 'blue')
lines(x, y2, lty = 1, lwd = 2, col = 'red')
axis(1, pos = 0)
axis(2, pos = 0)
for (i in 1:7) lines(c(x[i], x[i]), c(0, y1[i]), lty = 2)

y3 <- y1 + 1.5
plot(c(1, 5), c(0, 5), type = 'n', axes = FALSE, xlab = 'Treatment', ylab = 'Mean', 
     main = 'Population Profiles')
abline(h=0, v=1)
lines(x, y1, lty=2, lwd=3)
lines(x, y3, lty=1, lwd=3, col = 'red')
for (i in 1:5) lines(c(x[i], x[i]), c(y1[i], y3[i]), col = 'green4', lty=3)
axis(1, pos = 0)
axis(2, pos = 1)

plot(c(1, 5), c(0, 5), type = 'n', axes = FALSE, xlab = 'Treatment', ylab = 'Mean', 
     main = 'Population Profiles')
abline(h=0, v=1)
lines(x, y3, lty=1, lwd=3, col = 'red')
lines(x, y3-0.01, lty=2, lwd=3, col = 'black')
axis(1, pos = 0)
axis(2, pos = 1)

y4 <- rep(2.5, 5)
plot(c(1, 5), c(0, 5), type = 'n', axes = FALSE, xlab = 'Treatment', ylab = 'Mean', 
     main = 'Population Profiles')
abline(h=0, v=1)
lines(x, y4, lty=1, lwd=3, col = 'red')
lines(x, y4-0.01, lty=2, lwd=3, col = 'black')
axis(1, pos = 0)
axis(2, pos = 1)

#### Wechsler intelligence test data #### 没有用到
library(gorica)
rm(list = ls(all = TRUE)) # clear variables
graphics.off() # close plot windows
data(wechsler)

## -------------------------------- Boston 房屋数据 --------------------------------
rm(list = ls(all = TRUE))
library(MASS)
str(Boston)
head(Boston)

# ----------------- 变量变换 -----------------
X_1 = log(Boston$crim)
X_2 = Boston$zn / 10
X_3 = log(Boston$indus)
X_4 = Boston$chas
X_5 = log(Boston$nox)
X_6 = log(Boston$rm)
X_7 = Boston$age^(2.5) / 10000
X_8 = log(Boston$dis)
X_9 = log(Boston$rad)
X_10 = log(Boston$tax)
X_11 = (exp(0.4 * Boston$ptratio)) / 1000
X_12 = Boston$black / 100
X_13 = sqrt(Boston$lstat)
X_14 = log(Boston$medv)

# ----------------- 用 X14 分组后变量 X1, X5, X8, X11, X13 均值向量的比较-----------------
boston = data.frame(X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8, X_9, X_10, X_11, X_12, X_13, X_14)
names(boston) = names(Boston)
boston_sub_group1 = subset(boston, boston$medv <= median(boston$medv), 
                           select = c(1, 5, 8, 11, 13))
boston_sub_group2 = subset(boston, boston$medv > median(boston$medv), 
                           select = c(1, 5, 8, 11, 13))

n1 = dim(boston_sub_group1)[1]
p = dim(boston_sub_group1)[2]
n2 = dim(boston_sub_group2)[1]
xbar_1 = sapply(boston_sub_group1, mean)
xbar_2 = sapply(boston_sub_group2, mean)
S_1 = cov(boston_sub_group1) * (n1 - 1) / n1
S_2 = cov(boston_sub_group2) * (n2 - 1) / n2
S = (n1 * S_1 + n2 * S_2) / (n1 + n2)
S = as.matrix(S)
S_inv = solve(S)
F_test = t(as.matrix(xbar_1 - xbar_2)) %*% S_inv %*% as.matrix(xbar_1 - xbar_2) *
  n1 * n2 * (n1 + n2 - p - 1) / (p * (n1 + n2)^2)
F_test

qf(0.05, p, n1 + n2 - p - 1, lower.tail = FALSE)

# ----------------- 联合置信区间 -----------------
mu_L = as.matrix(xbar_1 - xbar_2) - 
  sqrt(qf(0.05, p, n1 + n2 - p - 1, lower.tail = FALSE) * 
         (p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1)) *
         as.matrix(c(S[1, 1], S[2, 2], S[3, 3], S[4, 4], S[5, 5])))
mu_R = as.matrix(xbar_1 - xbar_2) + 
  sqrt(qf(0.05, p, n1 + n2 - p - 1, lower.tail = FALSE) * 
         (p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1)) *
         as.matrix(c(S[1, 1], S[2, 2], S[3, 3], S[4, 4], S[5, 5])))
data.frame(mu_L = mu_L, mu_R = mu_R)

# ----------------- 用 X4 分组后变量 X5, X8, X9, X12, X13, X14 均值向量的比较-----------------
boston_sub_group3 = subset(boston, boston$chas == 1, select = c(5, 8, 9, 12, 13, 14))
boston_sub_group4 = subset(boston, boston$chas == 0, select = c(5, 8, 9, 12, 13, 14))

n3 = dim(boston_sub_group3)[1]
p = dim(boston_sub_group3)[2]
n4 = dim(boston_sub_group4)[1]
xbar_3 = sapply(boston_sub_group3, mean)
xbar_4 = sapply(boston_sub_group4, mean)
S_3 = cov(boston_sub_group3) * (n3 - 1) / n3
S_4 = cov(boston_sub_group4) * (n4 - 1) / n4
S = (n3 * S_3 + n4 * S_4) / (n3 + n4)
S = as.matrix(S)
S_inv = solve(S)
F_test = t(as.matrix(xbar_3 - xbar_4)) %*% S_inv %*% as.matrix(xbar_3 - xbar_4) *
  n3 * n4 * (n3 + n4 - p - 1) / (p * (n3 + n4)^2)
F_test

qf(0.05, p, n3 + n4 - p - 1, lower.tail = FALSE)

# ----------------- 联合置信区间 -----------------
mu_L = as.matrix(xbar_3 - xbar_4) - 
  sqrt(qf(0.05, p, n3 + n4 - p - 1, lower.tail = FALSE) * 
         (p * (n3 + n4)^2) / (n3 * n4 * (n3 + n4 - p - 1)) *
         as.matrix(c(S[1, 1], S[2, 2], S[3, 3], S[4, 4], S[5, 5], S[6, 6])))
mu_R = as.matrix(xbar_3 - xbar_4) + 
  sqrt(qf(0.05, p, n3 + n4 - p - 1, lower.tail = FALSE) * 
         (p * (n3 + n4)^2) / (n3 * n4 * (n3 + n4 - p - 1)) *
         as.matrix(c(S[1, 1], S[2, 2], S[3, 3], S[4, 4], S[5, 5], S[6, 6])))
data.frame(mu_L = mu_L, mu_R = mu_R)

# ----------------- 检验线性约束 -----------------
X_14_lm = lm(medv ~ . , data = boston)
summary(X_14_lm)

par(mfrow = c(2, 3))
plot(X_14_lm, which = 1:6)

# ----------------- 全部回归系数的检验 -----------------
X = as.matrix(model.matrix(X_14_lm))
n = dim(boston)[1]
p = dim(boston)[2] - 1
q = p
beta_hat = as.matrix(X_14_lm$coefficients)
y = as.matrix(boston$medv)
A = array(0, dim = c(13, 14))
for (i in 1:13) A[i, i+1] = 1
F_test = (t(A %*% beta_hat) %*% solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% 
            (A %*% beta_hat) /
  t(y - X %*% beta_hat) %*% (y - X %*% beta_hat)) * (n - p) / q
F_test

qf(0.05, q, n - p, lower.tail = FALSE)

# ----------------- 是否河景房 X4 对房价影响的检验 -----------------
q = 1
A = array(0, dim = c(1, 14))
A[5] = 1
A = as.matrix(A)
F_test = (t(A %*% beta_hat) %*% solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% 
            (A %*% beta_hat) /
            t(y - X %*% beta_hat) %*% (y - X %*% beta_hat)) * (n - p) / q
F_test

qf(0.05, q, n - p, lower.tail = FALSE)

# ----------------- 变量 X1, X2, X3, X7 的显著性检验 -----------------
q = 4
A = array(0, dim = c(4, 14))
for (i in 1:3) A[i, i+1] = 1
A[4, 8] = 1
A = as.matrix(A)
F_test = (t(A %*% beta_hat) %*% solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% 
            (A %*% beta_hat) /
            t(y - X %*% beta_hat) %*% (y - X %*% beta_hat)) * (n - p) / q
F_test

qf(0.05, q, n - p, lower.tail = FALSE)

# ----------------- 剔除变量 X1, X2, X3, X7 的简化模型 -----------------
X_14_lm_reduced = lm(medv ~ chas + nox + rm + dis + rad + tax + ptratio + 
                       black + lstat , data = boston)
summary(X_14_lm_reduced)