返回

Chapter_03.R

5.1 KB · R · 2026-06-07 09:06
rm(list = ls(all = TRUE)) # 清空当前环境
graphics.off() # 清空绘图区域
## ---------------------------- Covariance matrix for the bank data set ----------------------------
library(mclust)
data(banknote)
str(banknote)
head(banknote)
S <- cov(banknote[, 2:7])
round(S, digits = 2)

library(ggplot2)
fig_1 <- ggplot(data = banknote, aes(x = Left, y = Diagonal, color = Status, size = 1.5)) +
  geom_point()
fig_1
fig_2 <- ggplot(data = banknote, aes(x = Right, y = Top, color = Status, size = 1.5)) +
  geom_point()
fig_2

counterfeit <- subset(banknote, Status == "counterfeit")
S_f <- cov(counterfeit[, 2:7])
round(S_f, digits = 3)

fig_3 <- ggplot(data = counterfeit, aes(x = Left, y = Diagonal, size = 1.5, color = 'orange')) +
  geom_point()
fig_3
fig_4 <- ggplot(data = counterfeit, aes(x = Right, y = Top, size = 1.5, color = 'orange')) +
  geom_point()
fig_4

genuine <- subset(banknote, Status == "genuine")
S_g <- cov(genuine[, 2:7])
round(S_g, digits = 3)

fig_5 <- ggplot(data = genuine, aes(x = Left, y = Diagonal, size = 1.5)) +
  geom_point()
fig_5
fig_6 <- ggplot(data = genuine, aes(x = Right, y = Top, size = 1.5,)) +
  geom_point()
fig_6

################################ Correlation matrix ###############################
R_g <- cor(genuine[, 2:7])
round(R_g, digits = 2)

R_f <- cor(counterfeit[, 2:7])
round(R_f, digits = 2)

library(corrgram)
str(auto)
head(auto)
cor(auto$MPG, auto$Weight)

library(ggplot2)
ggplot(auto, aes(x = Origin, y = MPG)) + 
  geom_boxplot()

ggplot(auto, aes(x = MPG, y = Weight, color = Origin, shape = Origin, size = 2)) + 
  geom_point()

w <- log((1 + cor(auto$MPG, auto$Weight)) / (1 - cor(auto$MPG, auto$Weight))) / 2
w
z <- (w - 0) / sqrt(1/(74-3))
z

p_value <- 2 * pnorm(abs(z), 0, 1, lower.tail = FALSE)
p_value

E_w <- log((1 + (-0.75)) / (1 - (-0.75))) / 2
E_w
z1 <- (w - (E_w)) / sqrt(1/(74-3))
z1
p_value <- 2 * pnorm(abs(z1), 0, 1, lower.tail = FALSE)
p_value

x <- c(230, 125, 200, 109, 181, 99, 55, 107, 165, 97, 105, 98, 150, 115, 85, 71, 97, 120, 0, 82,
  192, 100, 150, 103, 181, 80, 85, 111, 189, 90, 120, 93, 172, 95, 110, 86, 170, 125, 130, 78)
pullover <- matrix(x, ncol = 4, byrow = TRUE)
pullover <- as.data.frame(pullover)
colnames(pullover) <- c("X1", "X2", "X3", "X4")
pullover
ggplot(pullover, aes(x = X4, y = X1, size = 1.5)) + 
  geom_point()
r_14 <- cor(pullover$X1, pullover$X4)
r_14

w <- 0.5 * (log((1 + r_14)/(1 - r_14)))
w

w_star <- w - (3 * w + tanh(w)) / (4 * (10 - 1))
w_star

z <- (w_star - 0) / (sqrt(1/(10-1)))
z

p_value <- 2 * pnorm(abs(z), 0, 1, lower.tail = FALSE)
p_value

################################ Summary Statistics ###############################
pullover
pullover.mean <- sapply(pullover, mean)
pullover.mean

pullover <- as.matrix(pullover)
pullover.mean <- as.matrix(pullover.mean)
pullover.cov <- (1/10) * t(pullover) %*% pullover - pullover.mean %*% t(pullover.mean)
pullover.cov

pullover.cov.u <- (10/9) * pullover.cov
pullover.cov.u

pullover.cov.u2 <- cov(pullover)
pullover.cov.u2

pullover.cor <- cor(pullover)
round(pullover.cor, digits = 2)

pullover
A <- matrix(c(0, 0, 1, 10), nrow=1)
A %*% pullover.mean

S_Y <- A %*% pullover.cov.u2 %*% t(A)
S_Y

################################ Linear Model for Two Variables ###############################
pullover = as.data.frame(pullover)
plot(X1 ~ X2, data = pullover, xlab = "Price (X2)", ylab = "Sales (X1)", pch = 16, cex = 1.5)
pullover.lm <- lm(X1 ~ X2, data = pullover)
abline(pullover.lm, col = "red", lwd = 2)
summary(pullover.lm)

################################ Simple Analysis of Variance ###############################
Sales <- c(9, 11, 10, 12, 7, 11, 12, 10, 11, 13, 10, 15, 11, 15, 15, 13, 7, 15, 13, 10, 18, 
           14, 17, 9, 14, 17, 16, 14, 17, 15)
Strategy <- as.factor(c(rep(1, 10), rep(2, 10), rep(3, 10)))
Pullover <- data.frame(Sales = Sales, Strategy = Strategy)
Pullover

library(ggplot2)
p1 <- ggplot(data = Pullover) +
  geom_boxplot(aes(x = Strategy, y = Sales, fill = Strategy))
p1

library(dplyr)
group_by(Pullover, Strategy) %>%
  summarise(
    n_i = n(),
    hat_mean_i = mean(Sales),
    hat_sd_i = sd(Sales)
  )

sales.lm <- lm(Sales ~ Strategy, data = Pullover)
summary(sales.lm)

anova(sales.lm)

sales.aov <- aov(Sales ~ Strategy, data = Pullover)
summary(sales.aov)

################################ Multiple Linear Model ###############################
pullover
pullover.lm2 <- lm(X1 ~ ., data = pullover)
summary(pullover.lm2)

################################ Boston Housing ###############################
library(MASS)
str(Boston)
head(Boston)
?Boston

Boston_mean <- sapply(Boston, mean)
Boston_median <- sapply(Boston, median)
Boston_var <- sapply(Boston, var)
Boston_sd <- sapply(Boston, sd)
Boston_descriptive <- tibble(Mean = Boston_mean, Median = Boston_median, Variance = Boston_var, 
                                 Std = Boston_sd)
Boston_descriptive

S <- cov(Boston)
round(S, digits = 2)

R <- cor(Boston)
round(R, digits = 2)

R_14 <- R[14, 1:13]
W_14 <- 0.5 * log((1 + R_14) / (1 - R_14)) ## Fisher's Z-transformation
W_14
Z_14 <- (W_14 - 0) / sqrt(1/(dim(Boston)[1]-3)) ## Z statistics
Z_14
p_value <- 2 * pnorm(abs(Z_14), lower.tail = FALSE) ## testing H_0: r = 0
p_value

Boston.lm <- lm(medv ~ ., data = Boston)
summary(Boston.lm)