Chapter_13.R · 芝士酷

############################################ Car Mark data set #####################################
# ----- clear variables and close windows -----
rm(list = ls(all = TRUE))
graphics.off()

library(data.table)
setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data")
Carmean = fread("Carmean2N.csv", header = TRUE)
str(Carmean)
head(Carmean)
x = Carmean[, 2:9]
x = as.matrix(x)
head(x)

nr_x = dim(x)[1] # 观测值个数（行数）
nr_x
nc_x = dim(x)[2] # 变量个数（列数）
nc_x

colmean_x = apply(x, 2, mean) # 计算每个变量的平均值
round(colmean_x, digits = 3)

# 定义二元数据矩阵：大于各变量均值取 1，否则取 0
for (i in 1:nr_x) {
  for (j in 1:nc_x) {
    if (x[i, j] > colmean_x[j]) x[i, j] = 1
    else x[i, j] = 0
  }
}

x[17:19, ] # 以 17~19 这三行为例

############################################ French Food data set #####################################
# clear variables and close windows
rm(list = ls(all = TRUE))
graphics.off()
library(data.table)
x = fread("food.csv", header = TRUE)
(x = x[, 2:7])

D_1 = dist(x, method = "euclidean", diag = TRUE, upper = FALSE, p = 2)
round(D_1, digits = 2)

sx = scale(x)
D_2 = dist(sx, method = "euclidean", diag = TRUE, upper = FALSE, p = 2)
round(D_2, digits = 2)

############################################ 8 points - k-means clustering #################################
# clear all variables
rm(list = ls(all = TRUE))

# create matrix from 8 data points and define eight points
eight = cbind(c(-3, -2, -2, -2, 1, 1, 2, 4), c(0, 4, -1, -2, 4, 2, -4, -3))
eight

eight = eight[c(8, 7, 3, 1, 4, 2, 6, 5), ] # 数据集重新排序
eight

# Plot 1: The 8-point example with k-means
graphics.off()
plot(eight, type = "n", xlab = "", ylab = "", xlim = c(-4, 4), ylim = c(-4, 4), main = "8 points", asp = 1)
p12 = eight[1:2, ]
p12.mean = apply(p12, 2, mean)
segments(p12.mean[1], p12.mean[2], eight[1, 1], eight[1, 2], lwd = 2, col = 'orange')
segments(p12.mean[1], p12.mean[2], eight[2, 1], eight[2, 2], lwd = 2, col = 'orange')
p3_8 = eight[3:8, ]
p3_8.mean = apply(p3_8, 2, mean)
segments(p3_8.mean[1], p3_8.mean[2], eight[3, 1], eight[3, 2], lwd = 2, col = 'green3')
segments(p3_8.mean[1], p3_8.mean[2], eight[4, 1], eight[4, 2], lwd = 2, col = 'green3')
segments(p3_8.mean[1], p3_8.mean[2], eight[5, 1], eight[5, 2], lwd = 2, col = 'green3')
segments(p3_8.mean[1], p3_8.mean[2], eight[6, 1], eight[6, 2], lwd = 2, col = 'green3')
segments(p3_8.mean[1], p3_8.mean[2], eight[7, 1], eight[7, 2], lwd = 2, col = 'green3')
segments(p3_8.mean[1], p3_8.mean[2], eight[8, 1], eight[8, 2], lwd = 2, col = 'green3')
segments(p12.mean[1], p12.mean[2], p3_8.mean[1], p3_8.mean[2], lwd = 2, col = 'grey')
points(p12.mean[1], p12.mean[2], pch = 16, cex = 1)
points(p3_8.mean[1], p3_8.mean[2], pch = 16, cex = 1)
points(eight, pch = 21, cex = 2.7, bg = "white")
text(eight, as.character(1:8), col = "red3", cex = 1)

# Plot 2: The 8-point example with k-medoids
graphics.off()
plot(eight, type = "n", xlab = "", ylab = "", xlim = c(-4, 4), ylim = c(-4, 4), main = "8 points", asp = 1)
segments(eight[3, 1], eight[3, 2], eight[1, 1], eight[1, 2], lwd = 2, col = 'orange')
segments(eight[3, 1], eight[3, 2], eight[2, 1], eight[2, 2], lwd = 2, col = 'orange')
segments(eight[3, 1], eight[3, 2], eight[4, 1], eight[4, 2], lwd = 2, col = 'orange')
segments(eight[3, 1], eight[3, 2], eight[5, 1], eight[5, 2], lwd = 2, col = 'orange')
segments(eight[8, 1], eight[8, 2], eight[6, 1], eight[6, 2], lwd = 2, col = 'green3')
segments(eight[8, 1], eight[8, 2], eight[7, 1], eight[7, 2], lwd = 2, col = 'green3')
points(eight, pch = 21, cex = 2.7, bg = "white")
text(eight, as.character(1:8), col = "red3", cex = 1)

# Plot 3: The 8-point example with k-median
graphics.off()
plot(eight, type = "n", xlab = "", ylab = "", xlim = c(-4, 4), ylim = c(-4, 4), main = "8 points", asp = 1)
p12 = eight[1:2, ]
p12.median = apply(p12, 2, median)
segments(p12.median[1], p12.median[2], eight[1, 1], eight[1, 2], lwd = 2, col = 'orange')
segments(p12.median[1], p12.median[2], eight[2, 1], eight[2, 2], lwd = 2, col = 'orange')
p3_8 = eight[3:8, ]
p3_8.median = apply(p3_8, 2, median)
segments(p3_8.median[1], p3_8.median[2], eight[3, 1], eight[3, 2], lwd = 2, col = 'green3')
segments(p3_8.median[1], p3_8.median[2], eight[4, 1], eight[4, 2], lwd = 2, col = 'green3')
segments(p3_8.median[1], p3_8.median[2], eight[5, 1], eight[5, 2], lwd = 2, col = 'green3')
segments(p3_8.median[1], p3_8.median[2], eight[6, 1], eight[6, 2], lwd = 2, col = 'green3')
segments(p3_8.median[1], p3_8.median[2], eight[7, 1], eight[7, 2], lwd = 2, col = 'green3')
segments(p3_8.median[1], p3_8.median[2], eight[8, 1], eight[8, 2], lwd = 2, col = 'green3')
segments(p12.median[1], p12.median[2], p3_8.median[1], p3_8.median[2], lwd = 2, col = 'grey')
points(p12.median[1], p12.median[2], pch = 16, cex = 1)
points(p3_8.median[1], p3_8.median[2], pch = 16, cex = 1)
points(eight, pch = 21, cex = 2.7, bg = "white")
text(eight, as.character(1:8), col = "red3", cex = 1)

# Plot 4: The 8-point example —— dendrogram
d = dist(eight, method = "euclidean", p = 2, diag = TRUE) # 欧氏距离矩阵
round(d, digits = 2)
s1 = hclust(d, method = "single") # cluster analysis with single linkage algorithm 聚类分析-最短距离法 (single)
graphics.off()
plot(s1, hang = -0.1, frame.plot = TRUE, ann = FALSE)
title(main = "Single Linkage Dendrogram - 8 points", ylab = "Euclidean Distance")
abline(h = 3.5, lty = 2, col = 'cyan3') # cut the tree at distance 3.5

dd = d^2 # 欧氏距离平方的矩阵
round(dd, digits = 2)
s2 = hclust(dd, method = "single") # cluster analysis with single linkage algorithm 聚类分析-最短距离法 (single)
# The dendrogram for the 8-point example, Single linkage algorithm.
graphics.off()
plot(s2, hang = -0.1, frame.plot = TRUE, ann = FALSE)
title(main = "Single Linkage Dendrogram - 8 points", ylab = "Squared Euclidean Distance")

############################################ Randomly select 20 observations from the bank notes data #######################
# clear all variables
rm(list = ls(all = TRUE))
graphics.off()

## Load data
library(mclust)
data = banknote
set.seed(3)
xx = data[sample(1:nrow(data), 20, replace = F), ]  # sample of 20 randomly chosen bank notes from data
xx = xx[, 2:7]
xx

## PCA 主成分分析
mean = as.vector(colMeans(xx)) # mean vector
m = matrix(mean, nrow(xx), NROW(mean), byrow = T)
x = xx - m # centering
eig = eigen(cov(x)) # spectral decomposition
eva = eig$values # eigenvalues
eve = eig$vectors # eigenvectors
xm = as.matrix(x)
y = xm %*% eve # PC values
ym = y[, 1:2] # first two PCs

## Plot 1: PCA
graphics.off()
plot(ym, type = "n", xlab = "first PC", ylab = "second PC", main = "20 Swiss bank notes", 
     ylim = c(-5, 4), xlim = c(-4, 4.5), asp = 1)
text(ym[, 1], ym[, 2], rownames(xx), cex = 1.2)
abline(h = 0, v = 0, lty = 2, col = 'orange')

## Plot 2: Dendrogram for the 20 bank notes after applying the Ward algorithm
d = dist(xx, method = "euclidean", p = 2) # euclidean distance matrix
dd = d^2 # squared euclidean distance matrix
w = hclust(dd, method = "ward.D") # cluster analysis with ward algorithm
graphics.off()
plot(w, hang = -0.1, frame.plot = TRUE, ann = FALSE)
title(main = "Dendrogram for 20 Swiss bank notes", ylab = "Squared Euclidean Distance", 
      xlab = "Ward algorithm")

## Plot 3: PCA with clusters
library(car)
graphics.off()
groups = cutree(w, h = 60)
merg = matrix(c(ym, as.matrix(groups)), nrow = 20, ncol = 3)
merg = merg[sort.list(merg[, 3]), ]
merg1  = merg[1:6, 1:2]
merg2  = merg[7:20, 1:2]
plot(ym, type = "n", xlab = "first PC", ylab = "second PC", main = "20 Swiss bank notes, cut height 60", 
     ylim = c(-4, 4), xlim = c(-4, 4), asp = 1)
abline(h = 0, v = 0, lty = 2, col = 'orange')
text(ym[, 1], ym[, 2], rownames(xx), cex = 1.2)
dataEllipse(x = merg1[, 1], y = merg1[, 2], center.pch = 0, col = "red", plot.points = F, 
            add = T, levels = 0.85)
dataEllipse(x = merg2[, 1], y = merg2[, 2], center.pch = 0, col = "blue", plot.points = F, 
            add = T, levels = 0.95)

############################################ French food expenditure data set ############################################
# clear all variables
rm(list = ls(all = TRUE))
graphics.off()

## Load data
setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data")
fooddat = read_csv("food.csv")
food = as.data.frame(fooddat[, -1]) # delete the first column (types of families)
rownames(food) = c("MA2", "EM2", "CA2", "MA3", "EM3", "CA3", "MA4", "EM4", "CA4", "MA5", "EM5", "CA5") # define types of families
f = scale(food) # standardize variables
round(f, digits = 3)

## PCA
mean = as.vector(colMeans(f))
m = matrix(mean, nrow(f), NROW(mean), byrow = T)
x = f - m
eig = eigen(cov(x)) # spectral decomposition  
eva = eig$values
eve = eig$vectors
xm = as.matrix(x)
y = xm %*% eve
ym = y[, 1:2] # first two eigenvectors

## Plot 1: PCA
plot(ym, type = "n", xlab = "first PC", ylab = "second PC", main = "French Food data", 
     ylim = c(-5, 5), xlim = c(-5, 5), asp = 1)
text(ym[, 1], ym[, 2], rownames(f), cex = 1.2)
abline(h = 0, v = 0, lty = 2, col = 'orange')

## Plot 2: Dendrogram for the standardized food.dat after Ward algorithm
d = dist(f, "euclidean", p = 2) # euclidean distance matrix
dd = d^2 # squared euclidean distance matrix                                  
w = hclust(dd, method = "ward.D") # cluster analysis with ward algorithm
graphics.off()
plot(w, hang = -0.1, frame.plot = TRUE, ann = FALSE)
title(main = "Ward Dendrogram for French Food", ylab = "Squared Euclidean Distance")

## Plot 3: PCA with clusters
groups = cutree(w, h = 60)
merg = matrix(c(ym, as.matrix(groups)), nrow = 12, ncol = 3)
merg = merg[sort.list(merg[, 3]), ]
merg1 = merg[1:7, 1:2]
merg2 = merg[8:12, 1:2]
graphics.off()
plot(ym, type = "n", xlab = "first PC", ylab = "second PC", main = "French Food data, cut height 60", 
     ylim = c(-5, 4), xlim = c(-5, 4.5), asp = 1)
abline(h = 0, v = 0, lty = 2, col = 'orange')
text(ym[, 1], ym[, 2], rownames(f), cex = 1.2)
dataEllipse(x = merg1[, 1], y = merg1[, 2], center.pch = 0, col = "red", plot.points = F, 
            add = T, levels = 0.8)
dataEllipse(x = merg2[, 1], y = merg2[, 2], center.pch = 0, col = "blue", plot.points = F, 
            add = T, levels = 0.65)

############################################ Adaptive Weights Clustering 自适应权重聚类 ####################################
rm(list = ls(all = TRUE))
eight = cbind(c(-3, -2, -2, -2, 1, 1, 2, 4), c(0, 4, -1, -2, 4, 2, -4, -3))
eight

eight = eight[c(8, 7, 3, 1, 4, 2, 6, 5), ] # 数据集重新排序
eight

d = dist(eight, method = "euclidean", diag = TRUE, upper = TRUE, p = 2) # 计算欧氏距离
D = as.matrix(d) # 距离矩阵
nr = nrow(D) # 矩阵 D 的行数
nc = ncol(D) # 矩阵 D 的列数
W = matrix(0, nrow = nr, ncol = nc) # 权重矩阵的初始值
for (i in 1:nr){
  for (j in 1:nc) {
    if (D[i, j] < 4) W[i, j] = 1
  }
} # 距离小于 4 的归为一类
round(D, digits = 2) # 距离矩阵
W # 权重矩阵

################################################ Spectral Clustering 谱聚类 ################################################
rm(list = ls(all = TRUE))
library(kernlab)
?specc

eight = cbind(c(-3, -2, -2, -2, 1, 1, 2, 4), c(0, 4, -1, -2, 4, 2, -4, -3))
eight

eight = eight[c(8, 7, 3, 1, 4, 2, 6, 5), ] # 数据集重新排序
eight

sc.eight = specc(eight, centers = 2)
sc.eight
centers(sc.eight)
withinss(sc.eight)

merg1 = eight[1:2, 1:2]
merg2 = eight[3:8, 1:2]
plot(eight, col = sc.eight, pch = 16, cex = 2, xlab = '', ylab = '', asp = 1, ylim = c(-6, 6))
dataEllipse(x = merg2[, 1], y = merg2[, 2], center.pch = 0, col = "blue", plot.points = F, 
            add = T, levels = 0.75)
ellipse(center = c(3, -3.5), shape = matrix(c(1, 0.6, 0.6, 1), nrow = 2), center.pch = 0,
        col = "red", radius = 1.2)

################################################ iris 数据集的聚类分析 ################################################
## 系统聚类法 hclust()
?hclust
rm(list = ls(all = TRUE))
x = iris
head(x)
d = dist(x[, 1:4]) # 距离矩阵
iris.hc1 = hclust(d, method = "complete") # 最长距离法
plot(iris.hc1, hang = -1) # 聚类树
re1 = rect.hclust(iris.hc1, k = 3) # 聚类树分成三类
re1 # 分成三类的结果
iris.id1 = cutree(iris.hc1, 3) # 编成三组
table(iris.id1, x$Species) # 分类结果对比

iris.hc2 = hclust(d, method = "centroid") # 重心法
plot(iris.hc2, hang = -1) # 聚类树
re2 = rect.hclust(iris.hc2, k = 3) # 聚类树分成三类
re2 # 分成三类的结果
iris.id2 = cutree(iris.hc2, 3) # 编成三组
table(iris.id2, x$Species) # 分类结果对比

iris.hc3 = hclust(d, method = "median") # 中位数法
plot(iris.hc3, hang = -1) # 聚类树
re3 = rect.hclust(iris.hc3, k = 3) # 聚类树分成三类
re3 # 分成三类的结果
iris.id3 = cutree(iris.hc3, 3) # 编成三组
table(iris.id3, x$Species) # 分类结果对比

################################################ USArrests 数据集的聚类分析 ################################################
## 动态聚类法 kmeans()
rm(list = ls(all = TRUE))
graphics.off()

library(factoextra)
library(cluster)

# 载入数据
df = USArrests

# 移除有缺失值的数据，该数据集没有缺失数据
df = na.omit(df)

# 标准化
df = scale(df)
head(df)

# 聚类数量与总体平方和的关系图，聚类数 k 应选择使总体平方和趋缓时的值 k = 4
fviz_nbclust(df, kmeans, method = "wss")

# 根据聚类数量计算差距统计量
gap_stat = clusGap(df, FUN = kmeans, nstart = 25, K.max = 10, B = 50)

# 聚类数量与差距统计量的关系图，聚类数 k 应选择使差距统计量最大时的值 k = 4
fviz_gap_stat(gap_stat)

# 聚为 4 类的 kmeans 统计
km_df = kmeans(df, centers = 4, nstart = 25)

# 查看聚类结果
km_df

# 聚类图
fviz_cluster(km_df, data = df)

# 计算每一类当中各变量的均值
aggregate(USArrests, by = list(cluster = km_df$cluster), mean)

# 将聚类结果添加到原始数据当中
final_data = cbind(USArrests, Cluster = km_df$cluster)
final_data

################################################ spirals 数据集的聚类分析 ################################################
## 谱聚类法 specc()
rm(list = ls(all = TRUE))
graphics.off()

library(kernlab)

# 载入数据
data(spirals)
x = spirals
head(x)

# 数据集的散点图
plot(x, xlab = '', ylab = '', pch = 16, cex = 1.5)

# 聚成 2 类的谱聚类
sc.x = specc(x, centers = 2)

# 聚类结果
sc.x

# 类的中心
centers(sc.x)

# 每一类中数据点的数量
size(sc.x)

# 每一类的组内平方和
withinss(sc.x)

# 按照分类结果的散点图
plot(x, xlab = '', ylab = '', pch = 16, cex = 1.5, col = sc.x)