## -------------------------------- 似然比检验 -------------------------------- rm(list = ls(all = TRUE)) # 清空工作环境 graphics.off() # 清空绘图区域 ## 检验问题 1 library(mclust) data(banknote) str(banknote) head(banknote) banknote_genuine = subset(banknote, Status == 'genuine')[, 2:7] banknote_counterfeit = subset(banknote, Status == 'counterfeit')[, 2:7] mu_0 <- sapply(banknote_genuine, mean); mu_0 x_bar <- sapply(banknote_counterfeit, mean); x_bar n <- dim(banknote_counterfeit)[1] Sigma <- (n-1) * cov(banknote_counterfeit) / n; Sigma mu_0 <- as.matrix(mu_0) x_bar <- as.matrix(x_bar) n * t(x_bar - mu_0) %*% solve(Sigma) %*% (x_bar - mu_0) #### Test Problem 2 f <- function(x) log(1 + x) curve(f, 0, 10, xlab='', ylab='', axes=F, lwd=2) abline(h=0, v=0) lines(c(5, 5), c(0, f(5)), lty=2, col='red') lines(c(0, 5), c(f(5), f(5)), lty=2, col='red') p <- dim(Sigma)[1] ((n-p) / p) * t(x_bar - mu_0) %*% solve(Sigma) %*% (x_bar - mu_0) qf(0.05, p, n-p, lower.tail = FALSE) #### The 95% simultaneous confidence intervals x_lower <- array(0, dim = p) x_upper <- array(0, dim = p) for (i in 1:p){ x_lower[i] <- x_bar[i] - sqrt(((p)/(n-p)) * qf(0.05, p, n-p, lower.tail = FALSE) * Sigma[i, i]) x_upper[i] <- x_bar[i] + sqrt(((p)/(n-p)) * qf(0.05, p, n-p, lower.tail = FALSE) * Sigma[i, i]) } mu_simul_int <- data.frame(x_lower = x_lower, x_upper = x_upper) mu_simul_int a <- matrix(c(0, 0, 0, 1, -1, 0), nrow=6) t(a) %*% x_bar - sqrt(((p)/(n-p)) * qf(0.05, p, n-p, lower.tail = FALSE) * t(a) %*% Sigma %*% a) t(a) %*% x_bar + sqrt(((p)/(n-p)) * qf(0.05, p, n-p, lower.tail = FALSE) * t(a) %*% Sigma %*% a) library(car) a <- c(2, 2) b <- matrix(c(1, -0.5, -0.5, 1), nrow = 2) plot(c(0, 4), c(0, 4), type = 'n', xlab = '', ylab = '', axes = F, asp = 1) ellipse(a, b, radius = 1.5, center.pch = 16, center.cex = 1, fill = TRUE) abline(h = 0, v = 0) lines(c(2, 3.3), c(2, 0.7), lty = 2) lines(c(2, 2.75), c(2, 2.75), lty = 2) a <- c(2, 2) b <- matrix(c(1, -0.5, -0.5, 1), nrow = 2) plot(c(0, 4), c(0, 4), type = 'n', xlab = '', ylab = '', axes = F, asp = 1) ellipse(a, b, radius = 1.5, center.pch = 16, center.cex=1, fill = TRUE) abline(h = 0, v = 0) lines(c(-0.05, 2.55, 4.05, 1.45, -0.05), c(2.55, -0.05, 1.45, 4.05, 2.55), col = 'red', lwd = 2) points(1.45, 3.7, pch = 16, cex = 1.5, col = 'cyan4') lines(c(0.15, 1.65), c(2.4, 3.9), lty = 4, lwd = 1.5, col = 'cyan4') lines(c(3.85, 1.25), c(1.25, 3.9), lty = 4, lwd = 1.5, col = 'cyan4') ##### US companies data # clear variables and close windows rm(list = ls(all = TRUE)) graphics.off() # energy sector data x = rbind(c(13621, 4848, 4572, 485, 898.9, 23.4), c(1117, 1038, 478, 59.7, 91.7, 3.8), c(1633, 701, 679, 74.3, 135.9, 2.8), c(5651, 1254, 2002, 310.7, 407.9, 6.2), c(5835, 4053, 1601, -93.8, 173.8, 10.8), c(3494, 1653, 1442, 160.9, 320.3, 6.4), c(1654, 451, 779, 84.8, 130.4, 1.6), c(1679, 1354, 687, 93.8, 154.6, 4.6), c(1257, 355, 181, 167.5, 304, 0.6), c(1743, 597, 717, 121.6, 172.4, 3.5), c(1440, 1617, 639, 81.7, 126.4, 3.5), c(14045, 15636, 2754, 418, 1462, 27.3), c(3010, 749, 1120, 146.3, 209.2, 3.4), c(3086, 1739, 1507, 202.7, 335.2, 4.9), c(1995, 2662, 341, 34.7, 100.7, 2.3)) x # unbiased covariance matrix for energy sector (cov(x[, 1:2])) # MLE/empirical variance matrix for energy sector (S = cov(x[, 1:2]) * 14/15) # manufacturing sector data y = rbind(c(1093, 1679, 1070, 100.9, 164.5, 20.8), c(1128, 1516, 430, -47, 26.7, 13.2), c(1804, 2564, 483, 70.5, 164.9, 26.6), c(4662, 4781, 2988, 28.7, 371.5, 66.2), c(6307, 8199, 598, -771.5, -524.3, 57.5), c(2366, 3305, 1117, 131.2, 256.5, 25.2), c(4084, 4346, 3023, 302.7, 521.7, 37.5), c(10348, 5721, 1915, 223.6, 322.5, 49.5), c(752, 2149, 101, 11.1, 15.2, 2.6), c(10528, 14992, 5377, 312.7, 710.7, 184.8)) y # unbiased variance matrix for manufacturing sector (cov(y[, 1:2])) # empirical/MLE variance matrix for manufacturing sector (sigma = cov(y[, 1:2]) * 9/10) # testing statistic (test = 15 * sum(diag(solve(sigma) %*% S)) - 15 * log10(det(solve(sigma) %*% S)) - 15 * 2) # p-value (p = pchisq(test, 3, lower.tail = FALSE)) #### Pullovers # clear variables and close windows rm(list = ls(all = TRUE)) graphics.off() x <- c(230, 125, 200, 109, 181, 99, 55, 107, 165, 97, 105, 98, 150, 115, 85, 71, 97, 120, 0, 82, 192, 100, 150, 103, 181, 80, 85, 111, 189, 90, 120, 93, 172, 95, 110, 86, 170, 125, 130, 78) pullover <- matrix(x, ncol = 4, byrow = TRUE) pullover <- as.data.frame(pullover) colnames(pullover) <- c("X1", "X2", "X3", "X4") pullover X1_2.lm <- lm(X1 ~ X2, data = pullover) summary(X1_2.lm) plot(pullover[, 2:1], xlab='X2 (Price)', ylab='X1 (Sales)', axes=FALSE, pch=16, ylim=c(90, 250), xlim=c(75, 130), cex=2) axis(1) axis(2) abline(X1_2.lm, col='red', lwd=2) # LR test statistic y <- as.matrix(pullover$X1) X <- matrix(c(rep(1, 10), pullover$X2), ncol=2, byrow=FALSE) beta_0 <- matrix(c(211, 0), nrow=2) beta_hat <- as.matrix(X1_2.lm$coefficients) n <- 10 (test.stat <- n * log(t(y - X %*% beta_0) %*% (y - X %*% beta_0) / t(y - X %*% beta_hat) %*% (y - X %*% beta_hat))) # p-value pchisq(test.stat, 2, lower.tail = FALSE) # F test statistic (ftest.stat <- ((n -2)/2) * (t(y - X %*% beta_0) %*% (y - X %*% beta_0) / t(y - X %*% beta_hat) %*% (y - X %*% beta_hat) - 1)) # p-value pf(ftest.stat, 2, n-2, lower.tail = FALSE) #### Test Problem 6 # clear variables and close windows rm(list = ls(all = TRUE)) library(mclust) data(banknote) banknote_counterfeit = subset(banknote, Status == 'counterfeit')[, 2:7] x_bar <- as.matrix(sapply(banknote_counterfeit, mean)); x_bar n <- dim(banknote_counterfeit)[1] S_f <- (n-1) * cov(banknote_counterfeit) / n; S_f A <- matrix(c(0, 0, 0, 1, -1, 0), nrow = 1, byrow = TRUE); A # Test statistic test.stat <- (n-1) * t(A %*% x_bar) %*% solve(A %*% S_f %*% t(A)) %*% (A %*% x_bar) test.stat # p-value pf(test.stat, 1, n-1, lower.tail = FALSE) #### Example: Bock rm(list = ls(all = TRUE)) n <- 40 p <- 4 x_bar <- matrix(c(1.086, 2.544, 2.851, 3.420), nrow = 4) x <- c(2.902, 2.438, 2.963, 2.183, 2.438, 3.049, 2.775, 2.319, 2.963, 2.775, 4.281, 2.939, 2.183, 2.319, 2.939, 3.162) S <- matrix(x, nrow = 4, byrow = TRUE) C <- matrix(c(1, -1, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1), nrow = 3, byrow = TRUE) # F statistic (F_obs <- ((n-p-1) / (p-1)) * t(x_bar) %*% t(C) %*% solve(C %*% S %*% t(C)) %*% C %*% x_bar) (p_value <- pf(F_obs, p-1, n-p+1, lower.tail = FALSE)) # Simultaneous 95% confidence intervals b1 <- matrix(c(1, -1, 0, 0), nrow = 4) (t(b1) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b1) %*% S %*% b1)) (t(b1) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b1) %*% S %*% b1)) b2 <- matrix(c(0, 1, -1, 0), nrow = 4) (t(b2) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b2) %*% S %*% b2)) (t(b2) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b2) %*% S %*% b2)) b3 <- matrix(c(0, 0, 1, -1), nrow = 4) (t(b3) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b3) %*% S %*% b3)) (t(b3) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b3) %*% S %*% b3)) b4 <- matrix(c(1, -1/3, -1/3, -1/3), nrow = 4) (t(b4) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b4) %*% S %*% b4)) (t(b4) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b4) %*% S %*% b4)) b5 <- matrix(c(1/3, 1/3, 1/3, -1), nrow = 4) (t(b5) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b5) %*% S %*% b5)) (t(b5) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b5) %*% S %*% b5)) b6 <- matrix(c(0, 1, 0, -1), nrow = 4) (t(b6) %*% x_bar - sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b6) %*% S %*% b6)) (t(b6) %*% x_bar + sqrt((p-1)/(n-p+1) * qf(0.05, p-1, n-p+1, lower.tail = FALSE) * t(b6) %*% S %*% b6)) #### Test Problem 7 # Classic pullover data, regression of sales X1 on X2, X3, and X4 x <- c(230, 125, 200, 109, 181, 99, 55, 107, 165, 97, 105, 98, 150, 115, 85, 71, 97, 120, 0, 82, 192, 100, 150, 103, 181, 80, 85, 111, 189, 90, 120, 93, 172, 95, 110, 86, 170, 125, 130, 78) pullover <- matrix(x, ncol = 4, byrow = TRUE) pullover <- as.data.frame(pullover) colnames(pullover) <- c("X1", "X2", "X3", "X4") pullover X1_234 <- lm(X1 ~ X2 + X3 + X4, data = pullover) summary(X1_234) # Likelihood Ratio statistic n <- 10 A <- matrix(c(0, 1, 1/2, 0), nrow = 1) X <- pullover[, 2:4] X <- as.matrix(cbind(X_1 = rep(1, 10), X)) beta_hat <- as.matrix(coefficients(X1_234)) beta_tilde <- beta_hat - solve(t(X) %*% X) %*% t(A) %*% solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% A %*% beta_hat LR <- n * log(sum((pullover$X1 - X %*% beta_tilde)^2) / sum((pullover$X1 - X %*% beta_hat)^2)) LR # p_value q <- 4 p_value <- pchisq(LR, q, lower.tail = FALSE) p_value # F statistic p <- 3 F <- ((n-p)/q) * (t(A %*% beta_hat) %*% solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% A %*% beta_hat) / (sum((pullover$X1 - X %*% beta_hat)^2)) F # p_value p_value <- pf(F, q, n-p, lower.tail = FALSE) p_value #### Test Problem 8 ##### US companies data #### # clear variables and close windows rm(list = ls(all = TRUE)) graphics.off() # energy sector data x = rbind(c(13621, 4848, 4572, 485, 898.9, 23.4), c(1117, 1038, 478, 59.7, 91.7, 3.8), c(1633, 701, 679, 74.3, 135.9, 2.8), c(5651, 1254, 2002, 310.7, 407.9, 6.2), c(5835, 4053, 1601, -93.8, 173.8, 10.8), c(3494, 1653, 1442, 160.9, 320.3, 6.4), c(1654, 451, 779, 84.8, 130.4, 1.6), c(1679, 1354, 687, 93.8, 154.6, 4.6), c(1257, 355, 181, 167.5, 304, 0.6), c(1743, 597, 717, 121.6, 172.4, 3.5), c(1440, 1617, 639, 81.7, 126.4, 3.5), c(14045, 15636, 2754, 418, 1462, 27.3), c(3010, 749, 1120, 146.3, 209.2, 3.4), c(3086, 1739, 1507, 202.7, 335.2, 4.9), c(1995, 2662, 341, 34.7, 100.7, 2.3)) n1 <- dim(x)[1]; p <- 2 # mean vector for energy sector (x_mean <- as.matrix(apply(x[, 1:2], 2, mean))) # MLE/empirical variance matrix for energy sector (S_1 = cov(x[, 1:2]) * (n1-1)/(n1)) # manufacturing sector data y = rbind(c(1093, 1679, 1070, 100.9, 164.5, 20.8), c(1128, 1516, 430, -47, 26.7, 13.2), c(1804, 2564, 483, 70.5, 164.9, 26.6), c(4662, 4781, 2988, 28.7, 371.5, 66.2), c(6307, 8199, 598, -771.5, -524.3, 57.5), c(2366, 3305, 1117, 131.2, 256.5, 25.2), c(4084, 4346, 3023, 302.7, 521.7, 37.5), c(10348, 5721, 1915, 223.6, 322.5, 49.5), c(752, 2149, 101, 11.1, 15.2, 2.6), c(10528, 14992, 5377, 312.7, 710.7, 184.8)) n2 <- dim(y)[1] # mean vector for manufacturing sector (y_mean <- as.matrix(apply(y[, 1:2], 2, mean))) # MLE/empirical variance matrix for manufacturing sector (S_2 = cov(y[, 1:2]) * (n2-1)/(n2)) # Weighted S (S <- (n1 * S_1 + n2 * S_2) / (n1 + n2)) # F statistic (F <- n1 * n2 * (n1 + n2 - p - 1) / (p * (n1 + n2)^2) * t(x_mean - y_mean) %*% solve(S) %*% (x_mean - y_mean)) # p-value (pf(F, p, n1+n2-p-1, lower.tail = FALSE)) # The 95% simultaneous confidence intervals for the differences a <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * qf(0.05, p, n1+n2-p-1) * S[1, 1]) b <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * qf(0.05, p, n1+n2-p-1) * S[2, 2]) x_mean - y_mean - matrix(c(a, b), nrow=2) x_mean - y_mean + matrix(c(a, b), nrow=2) #### Test Problem 8 ##### Simulated data #### library(MASS) (mu_1 = c(8, 6, 10, 10)) (mu_2 = c(6, 6, 10, 13)) (Sigma_1 = matrix(c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), nrow = 4, byrow = TRUE)) (n1 = 30) (n2 = 20) (p = 4) (X_1 <- mvrnorm(n1 , mu_1, Sigma_1)) (X_2 <- mvrnorm(n2 , mu_2, Sigma_1)) # sample mean vectors and empirical covariance matrices mean_X1 <- apply(X_1, 2, mean) round(mean_X1, digits = 3) mean_X2 <- apply(X_2, 2, mean) round(mean_X2, digits = 3) S_1 = cov(X_1) * (n1-1)/(n1) round(S_1, digits = 3) S_2 = cov(X_2) * (n2-1)/(n2) round(S_2, digits = 3) # F statistic for testing H_0: mu_1 = mu_2 S <- (n1 * S_1 + n2 * S_2) / (n1 + n2) F <- n1 * n2 * (n1 + n2 - p - 1) / (p * (n1 + n2)^2) * t(mean_X1 - mean_X2) %*% solve(S) %*% (mean_X1 - mean_X2) F # p-value pf(F, p, n1+n2-p-1, lower.tail = FALSE) # The 95% simultaneous confidence intervals for the differences a1 <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * qf(0.05, p, n1+n2-p-1) * S[1, 1]) a2 <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * qf(0.05, p, n1+n2-p-1) * S[2, 2]) a3 <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * qf(0.05, p, n1+n2-p-1) * S[3, 3]) a4 <- sqrt(((p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1))) * qf(0.05, p, n1+n2-p-1) * S[4, 4]) mean_X1 - mean_X2 - matrix(c(a1, a2, a3, a4), nrow=4) mean_X1 - mean_X2 + matrix(c(a1, a2, a3, a4), nrow=4) # Simulate with different Sigma Sigma_2 = matrix(c(16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16), nrow = 4, byrow = TRUE) Y_1 <- mvrnorm(n1 , mu_1, Sigma_2); Y_1 Y_2 <- mvrnorm(n2 , mu_2, Sigma_2); Y_2 # sample mean vectors and empirical covariance matrices mean_Y1 <- apply(Y_1, 2, mean) round(mean_Y1, digits = 3) mean_Y2 <- apply(Y_2, 2, mean) round(mean_Y2, digits = 3) S_1 = cov(Y_1) * (n1-1)/(n1) round(S_1, digits = 3) S_2 = cov(Y_2) * (n2-1)/(n2) round(S_2, digits = 3) # F statistic for testing H_0: mu_1 = mu_2 S <- (n1 * S_1 + n2 * S_2) / (n1 + n2) F <- n1 * n2 * (n1 + n2 - p - 1) / (p * (n1 + n2)^2) * t(mean_Y1 - mean_Y2) %*% solve(S) %*% (mean_Y1 - mean_Y2) F # p-value pf(F, p, n1+n2-p-1, lower.tail = FALSE) #### Compare the vectors of means of the forged and the genuine banknotes. rm(list = ls(all = TRUE)) library(mclust) data(banknote) banknote_genuine = subset(banknote, Status == 'genuine')[, 2:7] banknote_counterfeit = subset(banknote, Status == 'counterfeit')[, 2:7] n_g <- dim(banknote_genuine)[1] p <- dim(banknote_genuine)[2] n_f <- dim(banknote_counterfeit)[1] # sample statistics mean_g <- apply(banknote_genuine, 2, mean) round(mean_g, digits = 2) mean_f <- apply(banknote_counterfeit, 2, mean) round(mean_f, digits = 2) Sigma_g <- (n_g - 1) * cov(banknote_genuine) / n_g round(Sigma_g, digits = 4) Sigma_f <- (n_f - 1) * cov(banknote_counterfeit) / n_f round(Sigma_f, digits = 4) # F statistic for testing H_0: mu_1 = mu_2 S <- (n_g * Sigma_g + n_f * Sigma_f) / (n_g + n_f) F <- n_g * n_f * (n_g + n_f - p - 1) / (p * (n_g + n_f)^2) * t(mean_g - mean_f) %*% solve(S) %*% (mean_g - mean_f) F # p-value pf(F, p, n_g + n_f - p - 1, lower.tail = FALSE) # The 95% simultaneous confidence intervals for the differences a1 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * qf(0.05, p, n_g + n_f - p - 1) * S[1, 1]) a2 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * qf(0.05, p, n_g + n_f - p - 1) * S[2, 2]) a3 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * qf(0.05, p, n_g + n_f - p - 1) * S[3, 3]) a4 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * qf(0.05, p, n_g + n_f - p - 1) * S[4, 4]) a5 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * qf(0.05, p, n_g + n_f - p - 1) * S[5, 5]) a6 <- sqrt(((p * (n_g + n_f)^2) / (n_g * n_f * (n_g + n_f - p - 1))) * qf(0.05, p, n_g + n_f - p - 1) * S[6, 6]) mean_g - mean_f - matrix(c(a1, a2, a3, a4, a5, a6), nrow=6) mean_g - mean_f + matrix(c(a1, a2, a3, a4, a5, a6), nrow=6) #### Test Problem 9 # likelihood ratio statistic LR_statistic <- log(det(S)) * (n_g + n_f) - (log(det(Sigma_g)) * n_g + log(det(Sigma_f)) * n_f) LR_statistic # p-value k <- 2 m <- 0.5 * (k-1) * p * (p+1) pchisq(LR_statistic, m, lower.tail = FALSE) #### Test Problem 10 # test statistic mean_g <- as.matrix(mean_g) mean_f <- as.matrix(mean_f) TS <- t(mean_g - mean_f) %*% solve((1/n_g) * Sigma_g + (1/n_f) * Sigma_f) %*% (mean_g - mean_f) TS # p-value pchisq(TS, p, lower.tail = FALSE) # The 95% simultaneous confidence intervals a <- array(0, dim = p) for (j in 1:p) { a[j] <- sqrt(qchisq(0.05, p, lower.tail = FALSE) * (Sigma_g[j, j] / n_g + Sigma_f[j, j] / n_f)) } a <- as.matrix(a) Lower <- mean_g - mean_f - a Upper <- mean_g - mean_f + a SCI <- data.frame(Lower = Lower, Upper = Upper) round(SCI, digits = 4) ################### Profile Analysis #################### ## Fig 7.1 rm(list = ls(all = TRUE)) # clear variables graphics.off() # close plot windows x <- c(0, 1, 2, 3, 4, 5, 6) y1 <- c(4.5, 4.0, 3.2, 2.8, 3.0, 3.8, 4.4) y2 <- c(3.0, 2.6, 2.4, 2.3, 2.5, 3.0, 3.1) plot(c(0, 5), c(0, 5), type = 'n', axes = FALSE, xlab = 'Treatment', ylab = 'Mean', main = 'Population Profiles', asp = 1) abline(h = 0, v = 0) lines(x, y1, lty = 1, lwd = 2, col = 'blue') lines(x, y2, lty = 1, lwd = 2, col = 'red') axis(1, pos = 0) axis(2, pos = 0) for (i in 1:7) lines(c(x[i], x[i]), c(0, y1[i]), lty = 2) y3 <- y1 + 1.5 plot(c(1, 5), c(0, 5), type = 'n', axes = FALSE, xlab = 'Treatment', ylab = 'Mean', main = 'Population Profiles') abline(h=0, v=1) lines(x, y1, lty=2, lwd=3) lines(x, y3, lty=1, lwd=3, col = 'red') for (i in 1:5) lines(c(x[i], x[i]), c(y1[i], y3[i]), col = 'green4', lty=3) axis(1, pos = 0) axis(2, pos = 1) plot(c(1, 5), c(0, 5), type = 'n', axes = FALSE, xlab = 'Treatment', ylab = 'Mean', main = 'Population Profiles') abline(h=0, v=1) lines(x, y3, lty=1, lwd=3, col = 'red') lines(x, y3-0.01, lty=2, lwd=3, col = 'black') axis(1, pos = 0) axis(2, pos = 1) y4 <- rep(2.5, 5) plot(c(1, 5), c(0, 5), type = 'n', axes = FALSE, xlab = 'Treatment', ylab = 'Mean', main = 'Population Profiles') abline(h=0, v=1) lines(x, y4, lty=1, lwd=3, col = 'red') lines(x, y4-0.01, lty=2, lwd=3, col = 'black') axis(1, pos = 0) axis(2, pos = 1) #### Wechsler intelligence test data #### 没有用到 library(gorica) rm(list = ls(all = TRUE)) # clear variables graphics.off() # close plot windows data(wechsler) ## -------------------------------- Boston 房屋数据 -------------------------------- rm(list = ls(all = TRUE)) library(MASS) str(Boston) head(Boston) # ----------------- 变量变换 ----------------- X_1 = log(Boston$crim) X_2 = Boston$zn / 10 X_3 = log(Boston$indus) X_4 = Boston$chas X_5 = log(Boston$nox) X_6 = log(Boston$rm) X_7 = Boston$age^(2.5) / 10000 X_8 = log(Boston$dis) X_9 = log(Boston$rad) X_10 = log(Boston$tax) X_11 = (exp(0.4 * Boston$ptratio)) / 1000 X_12 = Boston$black / 100 X_13 = sqrt(Boston$lstat) X_14 = log(Boston$medv) # ----------------- 用 X14 分组后变量 X1, X5, X8, X11, X13 均值向量的比较----------------- boston = data.frame(X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8, X_9, X_10, X_11, X_12, X_13, X_14) names(boston) = names(Boston) boston_sub_group1 = subset(boston, boston$medv <= median(boston$medv), select = c(1, 5, 8, 11, 13)) boston_sub_group2 = subset(boston, boston$medv > median(boston$medv), select = c(1, 5, 8, 11, 13)) n1 = dim(boston_sub_group1)[1] p = dim(boston_sub_group1)[2] n2 = dim(boston_sub_group2)[1] xbar_1 = sapply(boston_sub_group1, mean) xbar_2 = sapply(boston_sub_group2, mean) S_1 = cov(boston_sub_group1) * (n1 - 1) / n1 S_2 = cov(boston_sub_group2) * (n2 - 1) / n2 S = (n1 * S_1 + n2 * S_2) / (n1 + n2) S = as.matrix(S) S_inv = solve(S) F_test = t(as.matrix(xbar_1 - xbar_2)) %*% S_inv %*% as.matrix(xbar_1 - xbar_2) * n1 * n2 * (n1 + n2 - p - 1) / (p * (n1 + n2)^2) F_test qf(0.05, p, n1 + n2 - p - 1, lower.tail = FALSE) # ----------------- 联合置信区间 ----------------- mu_L = as.matrix(xbar_1 - xbar_2) - sqrt(qf(0.05, p, n1 + n2 - p - 1, lower.tail = FALSE) * (p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1)) * as.matrix(c(S[1, 1], S[2, 2], S[3, 3], S[4, 4], S[5, 5]))) mu_R = as.matrix(xbar_1 - xbar_2) + sqrt(qf(0.05, p, n1 + n2 - p - 1, lower.tail = FALSE) * (p * (n1 + n2)^2) / (n1 * n2 * (n1 + n2 - p - 1)) * as.matrix(c(S[1, 1], S[2, 2], S[3, 3], S[4, 4], S[5, 5]))) data.frame(mu_L = mu_L, mu_R = mu_R) # ----------------- 用 X4 分组后变量 X5, X8, X9, X12, X13, X14 均值向量的比较----------------- boston_sub_group3 = subset(boston, boston$chas == 1, select = c(5, 8, 9, 12, 13, 14)) boston_sub_group4 = subset(boston, boston$chas == 0, select = c(5, 8, 9, 12, 13, 14)) n3 = dim(boston_sub_group3)[1] p = dim(boston_sub_group3)[2] n4 = dim(boston_sub_group4)[1] xbar_3 = sapply(boston_sub_group3, mean) xbar_4 = sapply(boston_sub_group4, mean) S_3 = cov(boston_sub_group3) * (n3 - 1) / n3 S_4 = cov(boston_sub_group4) * (n4 - 1) / n4 S = (n3 * S_3 + n4 * S_4) / (n3 + n4) S = as.matrix(S) S_inv = solve(S) F_test = t(as.matrix(xbar_3 - xbar_4)) %*% S_inv %*% as.matrix(xbar_3 - xbar_4) * n3 * n4 * (n3 + n4 - p - 1) / (p * (n3 + n4)^2) F_test qf(0.05, p, n3 + n4 - p - 1, lower.tail = FALSE) # ----------------- 联合置信区间 ----------------- mu_L = as.matrix(xbar_3 - xbar_4) - sqrt(qf(0.05, p, n3 + n4 - p - 1, lower.tail = FALSE) * (p * (n3 + n4)^2) / (n3 * n4 * (n3 + n4 - p - 1)) * as.matrix(c(S[1, 1], S[2, 2], S[3, 3], S[4, 4], S[5, 5], S[6, 6]))) mu_R = as.matrix(xbar_3 - xbar_4) + sqrt(qf(0.05, p, n3 + n4 - p - 1, lower.tail = FALSE) * (p * (n3 + n4)^2) / (n3 * n4 * (n3 + n4 - p - 1)) * as.matrix(c(S[1, 1], S[2, 2], S[3, 3], S[4, 4], S[5, 5], S[6, 6]))) data.frame(mu_L = mu_L, mu_R = mu_R) # ----------------- 检验线性约束 ----------------- X_14_lm = lm(medv ~ . , data = boston) summary(X_14_lm) par(mfrow = c(2, 3)) plot(X_14_lm, which = 1:6) # ----------------- 全部回归系数的检验 ----------------- X = as.matrix(model.matrix(X_14_lm)) n = dim(boston)[1] p = dim(boston)[2] - 1 q = p beta_hat = as.matrix(X_14_lm$coefficients) y = as.matrix(boston$medv) A = array(0, dim = c(13, 14)) for (i in 1:13) A[i, i+1] = 1 F_test = (t(A %*% beta_hat) %*% solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% (A %*% beta_hat) / t(y - X %*% beta_hat) %*% (y - X %*% beta_hat)) * (n - p) / q F_test qf(0.05, q, n - p, lower.tail = FALSE) # ----------------- 是否河景房 X4 对房价影响的检验 ----------------- q = 1 A = array(0, dim = c(1, 14)) A[5] = 1 A = as.matrix(A) F_test = (t(A %*% beta_hat) %*% solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% (A %*% beta_hat) / t(y - X %*% beta_hat) %*% (y - X %*% beta_hat)) * (n - p) / q F_test qf(0.05, q, n - p, lower.tail = FALSE) # ----------------- 变量 X1, X2, X3, X7 的显著性检验 ----------------- q = 4 A = array(0, dim = c(4, 14)) for (i in 1:3) A[i, i+1] = 1 A[4, 8] = 1 A = as.matrix(A) F_test = (t(A %*% beta_hat) %*% solve(A %*% solve(t(X) %*% X) %*% t(A)) %*% (A %*% beta_hat) / t(y - X %*% beta_hat) %*% (y - X %*% beta_hat)) * (n - p) / q F_test qf(0.05, q, n - p, lower.tail = FALSE) # ----------------- 剔除变量 X1, X2, X3, X7 的简化模型 ----------------- X_14_lm_reduced = lm(medv ~ chas + nox + rm + dis + rad + tax + ptratio + black + lstat , data = boston) summary(X_14_lm_reduced)