返回

Chapter_01.R

18.3 KB · R · 2026-06-07 09:06
# ------------------------------- Boxplot -------------------------------
rm(list = ls(all = TRUE))
graphics.off()

library(mclust)
data(banknote)
str(banknote)
head(banknote)
x = banknote$Length
summary(x)
quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1))
boxplot(x)

library(ggplot2)
ggplot(data = banknote, aes(x = Status, y = Length)) +
  geom_boxplot(notch = TRUE)

ggplot(data = banknote, aes(x = Status, y = Diagonal)) +
  geom_boxplot(notch = FALSE)

# ------------------------------- Histogram -------------------------------
par(mfrow=c(2, 2))
x_0 = 137.8
h1 = 0.1
breaks.vec1 <- seq(from = x_0, by = h1, length.out = 30)
hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec1, 
     xlab = 'h = 0.1', ylab = '', main = '')
h2 = 0.2
breaks.vec2 <- seq(from = x_0, by = h2, length.out = 16)
hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec2, 
     xlab = 'h = 0.2', ylab = '', main = '')
h3 = 0.3
breaks.vec3 <- seq(from = x_0, by = h3, length.out = 11)
hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec3, 
     xlab = 'h = 0.3', ylab = '', main = '')
h4 = 0.4
breaks.vec4 <- seq(from = x_0, by = h4, length.out = 8)
hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec4, 
     xlab = 'h = 0.4', ylab = '', main = '')

par(mfrow=c(2, 2))
x_0 = 137.45
breaks.vec5 <- seq(from = x_0, by = h4, length.out = 9)
hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec5, 
     xlab = 'x0 = 137.45', ylab = '', main = '')
x_0 = 137.55
breaks.vec6 <- seq(from = x_0, by = h4, length.out = 9)
hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec6, 
     xlab = 'x0 = 137.55', ylab = '', main = '')
x_0 = 137.65
breaks.vec7 <- seq(from = x_0, by = h4, length.out = 9)
hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec7, 
     xlab = 'x0 = 137.65', ylab = '', main = '')
x_0 = 137.75
breaks.vec8 <- seq(from = x_0, by = h4, length.out = 9)
hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec8, 
     xlab = 'x0 = 137.75', ylab = '', main = '')

# ------------------------------- Kernel Densities -------------------------------
par(mfrow=c(2, 2))
s = function(x) (3/4) * (1-x^2)
h = 1/2
g = function(x) (1/h) * s(x/h)
curve(s, -1, 1, xlim=c(-1.5, 1.5), ylim=c(0, 5), xlab="", ylab="Epanechnikov Kernel", col=2)
lines(c(-1.5, -1), c(0, 0), col=2)
lines(c(1, 1.5), c(0, 0), col=2)
abline(v=0, lty=2)
curve(g, -h, h, col=4, add=TRUE)
lines(c(-1, -0.5), c(0, 0), col=4)
lines(c(0.5, 1), c(0, 0), col=4)
lines(c(-0.5, 0.5), c(0, 0), lty=2)

x = c(-1.5, -0.8, -0.2, 0, 0.2)
f = function(t) (3/4) * (1-t^2)
h = 1/2
g1 = function(t) (1/h) * f((t-x[1])/h)
g2 = function(t) (1/h) * f((t-x[2])/h)
g3 = function(t) (1/h) * f((t-x[3])/h)
g4 = function(t) (1/h) * f((t-x[4])/h)
g5 = function(t) (1/h) * f((t-x[5])/h)
curve(g1, x[1]-h, x[1]+h, xlim=c(-2, 1), ylim=c(0, 5), xlab="", ylab="", lty=1)
curve(g2, x[2]-h, x[2]+h, add=TRUE, col=2, lty=1)
curve(g3, x[3]-h, x[3]+h, add=TRUE, col=3, lty=1)
curve(g4, x[4]-h, x[4]+h, add=TRUE, col=4, lty=1)
curve(g5, x[5]-h, x[5]+h, add=TRUE, col=5, lty=1)
abline(h=0, lty=2)
for (i in 1:5) lines(c(x[i], x[i]), c(0, 1.5), lty=2, col=i)
for (i in 1:5) points(x[i], 0, cex=0.8, pch=16, col=i)

h1 = function(t) g1(t)
curve(h1, -2, -1.3, xlim=c(-2, 1), ylim=c(0, 5), xlab="", ylab="")
h2 = function(t) (g1(t) + g2(t))
curve(h2, -1.3, -1, add=TRUE)
h3 = function(t) g2(t)
curve(h3, -1, -0.7, add=TRUE)
h4 = function(t) (g2(t) + g3(t))
curve(h4, -0.7, -0.5, add=TRUE)
h5 = function(t) (g2(t) + g3(t) + g4(t))
curve(h5, -0.5, -0.3, add=TRUE)
h6 = function(t) (g3(t) + g4(t) + g5(t))
curve(h6, -0.3, 0.3, add=TRUE)
h7 = function(t) (g4(t) + g5(t))
curve(h7, 0.3, 0.5, add=TRUE)
h8 = function(t) g5(t)
curve(h8, 0.5, 0.7, add=TRUE)
abline(h=0, lty=2)

f1 = function(t) (1/5) * g1(t)
curve(f1, -2, -1.3, xlim=c(-2, 1), ylim=c(0, 5), xlab="", ylab="")
f2 = function(t) (1/5) * (g1(t) + g2(t))
curve(f2, -1.3, -1, add=TRUE)
f3 = function(t) (1/5) * g2(t)
curve(f3, -1, -0.7, add=TRUE)
f4 = function(t) (1/5) * (g2(t) + g3(t))
curve(f4, -0.7, -0.5, add=TRUE)
f5 = function(t) (1/5) * (g2(t) + g3(t) + g4(t))
curve(f5, -0.5, -0.3, add=TRUE)
f6 = function(t) (1/5) * (g3(t) + g4(t) + g5(t))
curve(f6, -0.3, 0.3, add=TRUE)
f7 = function(t) (1/5) * (g4(t) + g5(t))
curve(f7, 0.3, 0.5, add=TRUE)
f8 = function(t) (1/5) * g5(t)
curve(f8, 0.5, 0.7, add=TRUE)
abline(h=0, lty=2)

par(mfrow=c(1, 1))
f1 = function(t) (1/5) * g1(t)
curve(f1, -2, -1.3, xlim=c(-2, 1), ylim=c(0, 1), xlab="", ylab="")
f2 = function(t) (1/5) * (g1(t) + g2(t))
curve(f2, -1.3, -1, add=TRUE)
f3 = function(t) (1/5) * g2(t)
curve(f3, -1, -0.7, add=TRUE)
f4 = function(t) (1/5) * (g2(t) + g3(t))
curve(f4, -0.7, -0.5, add=TRUE)
f5 = function(t) (1/5) * (g2(t) + g3(t) + g4(t))
curve(f5, -0.5, -0.3, add=TRUE)
f6 = function(t) (1/5) * (g3(t) + g4(t) + g5(t))
curve(f6, -0.3, 0.3, add=TRUE)
f7 = function(t) (1/5) * (g4(t) + g5(t))
curve(f7, 0.3, 0.5, add=TRUE)
f8 = function(t) (1/5) * g5(t)
curve(f8, 0.5, 0.7, add=TRUE)
abline(h=0, lty=2)
for (i in 1:5) points(x[i], 0, cex=1.5, pch=21, bg='orange')

## Example: Swiss banknotes
par(mfrow=c(2, 2))
plot(density(banknote$Diagonal, bw = 0.04), xlab='Bandwidth = 0.04', main='Swiss Bank Notes', 
     ylab='Diagonal')
plot(density(banknote$Diagonal, bw = 0.15), xlab='Bandwidth = 0.15', main='Swiss Bank Notes', 
     ylab='Diagonal')
plot(density(banknote$Diagonal, bw = 0.4), xlab='Bandwidth = 0.4', main='Swiss Bank Notes', 
     ylab='Diagonal')
plot(density(banknote$Diagonal, bw = 1), xlab='Bandwidth = 1', main='Swiss Bank Notes', 
     ylab='Diagonal')

library(ggplot2)
ggplot(banknote, aes(Diagonal, fill = Status, color = Status)) +
  geom_density(alpha = 0.2) +
  xlim(137, 143)

ggplot(banknote, aes(x = Top, y = Diagonal)) +
  xlim(8, 13)  +
  ylim(137, 143) +
  geom_density_2d(na.rm = TRUE)

# ------------------------------- Scatter plots -------------------------------
ggplot(banknote, aes(x = Top,  y = Diagonal, colour = Status, shape = Status)) +
  xlim(7, 13)  +
  ylim(137, 143) +
  geom_point(size = 2.5) +
  geom_abline(intercept = 136.9, slope = 1/3, colour = 'purple',size = 1.5)

library(plotly)
plot_ly(banknote, x =~ Bottom, y =~ Top, z =~ Diagonal) %>%
  add_markers(color =~ Status, symbol =~ Status)

library(GGally)
ggpairs(banknote, columns = 2:7, ggplot2::aes(colour=Status, alpha = 0.2))

# ------------------------------- Chernoff Faces -------------------------------
library(TeachingDemos)
faces(banknote[91:110, 2:7], fill = TRUE)
faces(banknote[1:100, 2:7], ncol=10, fill = TRUE)
faces(banknote[101:200, 2:7], ncol=10, fill = TRUE)

# library(aplpack)
# faces(banknote[91:110,], face.type = 1)

# ------------------------------- Andrews’ Curves -------------------------------
library(pracma)
A1 = as.matrix(banknote[96:105, 2:7])
andrewsplot(scale(A1), f = banknote$Status[96:105], style = 'cart')

A2 = as.matrix(banknote[96:105, 7:2])
andrewsplot(scale(A2), f = banknote$Status[96:105], style = 'cart')

andrewsplot(scale(A1), f = banknote$Status[96:105], style = 'pol')

andrewsplot(scale(A2), f = banknote$Status[96:105], style = 'pol')

A3 = as.matrix(banknote[1:200, 2:7])
andrewsplot(scale(A3), f = banknote$Status[1:200], style = 'cart')

andrewsplot(scale(A3), f = banknote$Status[1:200], style = 'pol')

A4 = as.matrix(banknote[1:200, 7:2])
andrewsplot(scale(A4), f = banknote$Status[1:200], style = 'cart')

andrewsplot(scale(A4), f = banknote$Status[1:200], style = 'pol')

# ------------------------------- Parallel Coordinate Plots -------------------------------
library(GGally)
banknote.sub = banknote[96:105,]
ggparcoord(data = banknote.sub, columns = 2:7, groupColumn = 1, scale = 'uniminmax')
ggparcoord(data = banknote.sub, columns = 7:2, groupColumn = 1, scale = 'uniminmax')

ggparcoord(data = banknote, columns = 2:7, groupColumn = 1, scale = 'uniminmax')
ggparcoord(data = banknote, columns = 7:2, groupColumn = 1, scale = 'uniminmax')

# ------------------------------- Hexagon Plots -------------------------------
library(ggplot2)
ggplot(data = banknote, aes(x = Diagonal, y = Top, z = Length)) +
  stat_summary_hex()

?diamonds
ggplot(data = diamonds, aes(x = depth, y = carat, z = price)) +
  stat_summary_hex()

# ------------------------------- Boston Housing -------------------------------
library(MASS)
str(Boston)
head(Boston)

# PCP (平行坐标图)
library(GGally)
c = Boston$medv > median(Boston$medv) # 按 X_14 是否大于其中位数分为两类
boston = cbind(Boston, c)
ggparcoord(data = boston, columns = 1:14, groupColumn = 15, scale = 'uniminmax')

# 全部14个变量的散点图矩阵
ggpairs(boston, columns = 1:14, ggplot2::aes(colour = c, alpha = 0.2))


# 变量 4, 6, 11, 12, 13, 14 的散点图矩阵
ggpairs(boston, columns = c(4, 6, 11, 12, 13, 14), ggplot2::aes(colour = c, alpha = 0.2))

# 变量 1:5, 14 的散点图矩阵
ggpairs(boston, columns = c(1:5, 14), ggplot2::aes(colour = c, alpha = 0.2))

library(magrittr)
library(gridExtra)
# ----------------- 变量 X_1 -----------------
# 变量 X_1 的箱线图
Box_X1 = ggplot(data = boston, aes(y = crim)) + geom_boxplot(notch = FALSE)

# 变量 X_1 取对数之后的箱线图
Box_Log_X1 = ggplot(data = boston, aes(y = log(crim))) + geom_boxplot(notch = FALSE)
grid.arrange(Box_X1, Box_Log_X1, ncol = 2)

# 变量 X_1 依房价的分类箱线图
Boxc_X1 = ggplot(data = boston, aes(x = c, y = crim)) +
  geom_boxplot(notch = FALSE)

# 变量 X_1 取对数之后的分类箱线图
Boxc_Log_X1 = ggplot(data = boston, aes(x = c, y = log(crim))) +
  geom_boxplot(notch = FALSE)
grid.arrange(Boxc_X1, Boxc_Log_X1, ncol = 2)

# 对数变换之后依房价的分类核密度图
ggplot(boston, aes(log(crim), fill = c, color = c)) +
  geom_density(alpha = 0.2)

# 变量 1:5, 14 变换之后的散点图矩阵
Log_X1 = log(Boston$crim)
X2 = Boston$zn / 10
Log_X3 = log(Boston$indus)
Log_X5 = log(Boston$nox)
Log_X14 = log(Boston$medv)
boston_sub = data.frame(crim = Log_X1, zn = X2, indus = Log_X3, 
              chas = Boston$chas, nox = Log_X5, medv = Log_X14, c = boston$c)
ggpairs(boston_sub, columns = 1:6, ggplot2::aes(colour = c, alpha = 0.2))

# 变量 log(X_1) 与 log(X_14) 依房价分类的散点图
ggplot(boston_sub, aes(x = crim, y = medv, color = c)) +
  geom_point() +
  xlab("log(crim)") +
  ylab("log(medv)")

# ----------------- 变量 X_2 -----------------
# 变量1:5, 14的散点图矩阵
ggpairs(boston, columns = c(1:5, 14), ggplot2::aes(colour = c, alpha = 0.2))

# 变量2的对数依变量14(房价)的对数的分类箱线图
ggplot(data = boston_sub, aes(x = c, y = zn)) +
  geom_boxplot(notch = FALSE)

# ----------------- 变量 X_3 -----------------
# 变量3与变量14的平行坐标图
ggparcoord(data = boston, columns = 1:14, groupColumn = 15, scale = 'uniminmax')

# 变量3与变量14的散点图
ggplot(boston, aes(x = indus, y = medv, color = c)) +
  geom_point()

# 变量3与变量14经对数变换之后的散点图
ggplot(boston, aes(x = log(indus), y = log(medv), color = c)) +
  geom_point()

# 变量3的核密度图
ggplot(boston, aes(indus)) + geom_density(linewidth = 1, colour = 'steelblue')

# ----------------- 变量 X_4 -----------------
# Boston房屋数据集的PCP (平行坐标图)
ggparcoord(data = boston, columns = 1:14, groupColumn = 15, scale = 'uniminmax')

# 变量 1:5, 14 的散点图矩阵
ggpairs(boston, columns = c(1:5, 14), ggplot2::aes(colour = c, alpha = 0.2))

# 校正后的数据集
library(mlbench)
data(BostonHousing2)
head(BostonHousing2)
subset(BostonHousing2, chas == 1)

# ----------------- 变量 X_5 -----------------
# 变量5与变量14的散点图
ggplot(boston, aes(x = nox, y = medv, color = c)) +
  geom_point()

# 变量5依房价的分类箱线图
ggplot(data = boston, aes(x = c, y = nox)) +
  geom_boxplot(notch = FALSE)

# ----------------- 变量 X_6 -----------------
# 变量6与变量14的散点图
ggplot(boston, aes(x = rm, y = medv, color = c)) +
  geom_point()

# 变量6依房价的分类箱线图
ggplot(data = boston, aes(x = c, y = rm)) +
  geom_boxplot(notch = FALSE)

# ----------------- 变量 X_7 -----------------
# 变量7与变量14的散点图
ggplot(boston, aes(x = age, y = medv, color = c)) +
  geom_point()

# 变量7依房价的分类箱线图
ggplot(data = boston, aes(x = c, y = age)) +
  geom_boxplot(notch = TRUE, fill = "cyan", colour = "orangered2")

# 变量7变换后的核密度图
Y7 = Boston$age^(2.5) / 10000
boston_sub = cbind(boston_sub, age_new = Y7)
ggplot(boston_sub, aes(age_new)) + geom_density(linewidth = 1, colour = 'steelblue')

# 变量7与8的散点图
ggplot(boston, aes(x = age, y = dis, color = c)) +
  geom_point()

# ----------------- 变量 X_8 -----------------
# 变量8与变量14的散点图
ggplot(boston, aes(x = dis, y = medv, color = c)) +
  geom_point()

# 变量8依房价的分类箱线图
ggplot(data = boston, aes(x = c, y = dis)) +
  geom_boxplot(notch = TRUE, fill = "cyan", colour = "orangered2")

# ----------------- 变量 X_9 -----------------
# 变量9与14的散点图
p9_1 = ggplot(boston, aes(x = rad, y = medv)) +
  geom_point()

# 变量9直方图
p9_2 = ggplot(boston, aes(x = rad)) +
  geom_histogram(binwidth = 1)

# 变量9的核密度图
p9_3 = ggplot(boston, aes(x = rad)) +
  geom_density()

grid.arrange(p9_1, p9_2, p9_3, ncol = 3)

# 变量9与变量14的散点图
p9_4 = ggplot(boston, aes(x = rad, y = medv, color = c)) +
  geom_point()

# 变量9依变量14的箱线图
p9_5 = ggplot(boston, aes(x = c, y = rad)) +
  geom_boxplot(notch = FALSE, fill = "cyan", colour = "orangered2")

# 变量9依变量14的分类核密度图
p9_6 = ggplot(boston, aes(x = rad, fill = c, colour = c)) +
  geom_density(alpha = 0.2)

grid.arrange(p9_4, p9_5, p9_6, ncol = 3)

# ----------------- 变量 X_10 -----------------
# 变量10与14的散点图
p10_1 = ggplot(boston, aes(x = tax, y = medv)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.85, linewidth = 1.5, colour = 'red3')

# 变量10的核密度图
p10_2 = ggplot(boston, aes(x = tax)) +
  geom_density()

grid.arrange(p10_1, p10_2, ncol = 2)

# 变量10依变量14的箱线图
p10_3 = ggplot(boston, aes(x = c, y = tax)) +
  geom_boxplot(notch = FALSE, fill = "cyan", colour = "orangered2")

grid.arrange(p10_1, p10_3, ncol = 2)

# ----------------- 变量 X_11 -----------------
# 变量11依变量14的箱线图
p11_1 = ggplot(boston, aes(x = c, y = ptratio)) +
  geom_boxplot(notch = FALSE, fill = "cyan", colour = "orangered2")
p11_1

# 变量11依14的分类散点图
p11_2 = ggplot(boston, aes(x = ptratio, y = medv, colour = c)) +
  geom_point() + 
  geom_smooth(method = 'lm', linewidth = 1.5, colour = 'blue', se = FALSE)
p11_2

# ----------------- 变量 X_12 -----------------
# 变量12与3,7,11,14的散点图矩阵
p12_3 = ggplot(boston, aes(x = black, y = indus)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3')

p12_7 = ggplot(boston, aes(x = black, y = age)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3')

p12_11 = ggplot(boston, aes(x = black, y = ptratio)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3')

p12_14 = ggplot(boston, aes(x = black, y = medv)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3')

grid.arrange(p12_3, p12_7, p12_11, p12_14, ncol = 2)

# 变量12的箱线图、描述性统计量值
ggplot(boston, aes(y = black)) +
  geom_boxplot(notch = FALSE, fill = "white", colour = "steelblue")
summary(boston$black)

# 变量12取值位于上、下四分位数之外的个数
dim(boston)
sum(boston$black < 375)
sum(boston$black > 397)

# 观测数据405至470的变量12的取值
boston$black[405:470]

# 变量12对数值的散点图
black_log = log(boston$black)
boston = cbind(boston, black_log = black_log)
p12_log = ggplot(boston, aes(x = black_log, y = medv)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3')

p12_medv = ggplot(boston, aes(x = black, y = medv)) +
  geom_point()

grid.arrange(p12_log, p12_medv, ncol = 2)

# 变量12与3的散点图
ggplot(boston, aes(x = black, y = indus)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3')

# 变量12与7的散点图
ggplot(boston, aes(x = black, y = age)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3')

# 变量12与11的散点图
ggplot(boston, aes(x = black, y = ptratio)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3')

# 变量12与14的散点图
ggplot(boston, aes(x = black, y = medv, colour = c)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3')

# ----------------- 变量 X_13 -----------------
# 变量13与14的散点图
p13_1 = ggplot(boston, aes(x = lstat, y = medv, colour = c)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, span = 0.8, linewidth = 1.5, colour = 'blue')

# 变量13的箱线图
p13_2 = ggplot(boston, aes(y = lstat)) +
  geom_boxplot(notch = TRUE, fill = "white", colour = "blue")

grid.arrange(p13_1, p13_2, ncol = 2)

# 变量13开平方根与14取对数的散点图
lstat_root = sqrt(boston$lstat)
medv_log = log(boston$medv)
boston = cbind(boston, lstat_root = lstat_root, medv_log = medv_log)
ggplot(boston, aes(x = lstat_root, y = medv_log, colour = c)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE, linewidth = 1.5, colour = 'blue')

# ----------------- 变量变换 -----------------
trans_X1 = log(boston$crim)
trans_X2 = boston$zn / 10
trans_X3 = log(boston$indus)
trans_X5 = log(boston$nox)
trans_X6 = log(boston$rm)
trans_X7 = boston$age^(2.5) / 10000
trans_X8 = log(boston$dis)
trans_X9 = log(boston$rad)
trans_X10 = log(boston$tax)
trans_X11 = (exp(0.4 * boston$ptratio)) / 1000
trans_X12 = boston$black / 100
trans_X13 = sqrt(boston$lstat)
trans_X14 = log(boston$medv)

trans_boston = data.frame(trans_X1, trans_X2, trans_X3, boston$chas, trans_X5, trans_X6,
                          trans_X7, trans_X8, trans_X9, trans_X10, trans_X11, trans_X12,
                          trans_X13, trans_X14)
names(trans_boston) = names(Boston)

par(mfrow = c(2, 1))
boxplot(Boston, main = 'Boston Housing Data', axes = FALSE, boxwex = 0.5)
axis(2)
boxplot(trans_boston, main = 'Transformed Boston Housing Data', axes = FALSE, boxwex = 0.5)
axis(2)