############################################ Car Mark data set ##################################### # ----- clear variables and close windows ----- rm(list = ls(all = TRUE)) graphics.off() library(data.table) setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data") Carmean = fread("Carmean2N.csv", header = TRUE) str(Carmean) head(Carmean) x = Carmean[, 2:9] x = as.matrix(x) head(x) nr_x = dim(x)[1] # 观测值个数(行数) nr_x nc_x = dim(x)[2] # 变量个数(列数) nc_x colmean_x = apply(x, 2, mean) # 计算每个变量的平均值 round(colmean_x, digits = 3) # 定义二元数据矩阵:大于各变量均值取 1,否则取 0 for (i in 1:nr_x) { for (j in 1:nc_x) { if (x[i, j] > colmean_x[j]) x[i, j] = 1 else x[i, j] = 0 } } x[17:19, ] # 以 17~19 这三行为例 ############################################ French Food data set ##################################### # clear variables and close windows rm(list = ls(all = TRUE)) graphics.off() library(data.table) x = fread("food.csv", header = TRUE) (x = x[, 2:7]) D_1 = dist(x, method = "euclidean", diag = TRUE, upper = FALSE, p = 2) round(D_1, digits = 2) sx = scale(x) D_2 = dist(sx, method = "euclidean", diag = TRUE, upper = FALSE, p = 2) round(D_2, digits = 2) ############################################ 8 points - k-means clustering ################################# # clear all variables rm(list = ls(all = TRUE)) # create matrix from 8 data points and define eight points eight = cbind(c(-3, -2, -2, -2, 1, 1, 2, 4), c(0, 4, -1, -2, 4, 2, -4, -3)) eight eight = eight[c(8, 7, 3, 1, 4, 2, 6, 5), ] # 数据集重新排序 eight # Plot 1: The 8-point example with k-means graphics.off() plot(eight, type = "n", xlab = "", ylab = "", xlim = c(-4, 4), ylim = c(-4, 4), main = "8 points", asp = 1) p12 = eight[1:2, ] p12.mean = apply(p12, 2, mean) segments(p12.mean[1], p12.mean[2], eight[1, 1], eight[1, 2], lwd = 2, col = 'orange') segments(p12.mean[1], p12.mean[2], eight[2, 1], eight[2, 2], lwd = 2, col = 'orange') p3_8 = eight[3:8, ] p3_8.mean = apply(p3_8, 2, mean) segments(p3_8.mean[1], p3_8.mean[2], eight[3, 1], eight[3, 2], lwd = 2, col = 'green3') segments(p3_8.mean[1], p3_8.mean[2], eight[4, 1], eight[4, 2], lwd = 2, col = 'green3') segments(p3_8.mean[1], p3_8.mean[2], eight[5, 1], eight[5, 2], lwd = 2, col = 'green3') segments(p3_8.mean[1], p3_8.mean[2], eight[6, 1], eight[6, 2], lwd = 2, col = 'green3') segments(p3_8.mean[1], p3_8.mean[2], eight[7, 1], eight[7, 2], lwd = 2, col = 'green3') segments(p3_8.mean[1], p3_8.mean[2], eight[8, 1], eight[8, 2], lwd = 2, col = 'green3') segments(p12.mean[1], p12.mean[2], p3_8.mean[1], p3_8.mean[2], lwd = 2, col = 'grey') points(p12.mean[1], p12.mean[2], pch = 16, cex = 1) points(p3_8.mean[1], p3_8.mean[2], pch = 16, cex = 1) points(eight, pch = 21, cex = 2.7, bg = "white") text(eight, as.character(1:8), col = "red3", cex = 1) # Plot 2: The 8-point example with k-medoids graphics.off() plot(eight, type = "n", xlab = "", ylab = "", xlim = c(-4, 4), ylim = c(-4, 4), main = "8 points", asp = 1) segments(eight[3, 1], eight[3, 2], eight[1, 1], eight[1, 2], lwd = 2, col = 'orange') segments(eight[3, 1], eight[3, 2], eight[2, 1], eight[2, 2], lwd = 2, col = 'orange') segments(eight[3, 1], eight[3, 2], eight[4, 1], eight[4, 2], lwd = 2, col = 'orange') segments(eight[3, 1], eight[3, 2], eight[5, 1], eight[5, 2], lwd = 2, col = 'orange') segments(eight[8, 1], eight[8, 2], eight[6, 1], eight[6, 2], lwd = 2, col = 'green3') segments(eight[8, 1], eight[8, 2], eight[7, 1], eight[7, 2], lwd = 2, col = 'green3') points(eight, pch = 21, cex = 2.7, bg = "white") text(eight, as.character(1:8), col = "red3", cex = 1) # Plot 3: The 8-point example with k-median graphics.off() plot(eight, type = "n", xlab = "", ylab = "", xlim = c(-4, 4), ylim = c(-4, 4), main = "8 points", asp = 1) p12 = eight[1:2, ] p12.median = apply(p12, 2, median) segments(p12.median[1], p12.median[2], eight[1, 1], eight[1, 2], lwd = 2, col = 'orange') segments(p12.median[1], p12.median[2], eight[2, 1], eight[2, 2], lwd = 2, col = 'orange') p3_8 = eight[3:8, ] p3_8.median = apply(p3_8, 2, median) segments(p3_8.median[1], p3_8.median[2], eight[3, 1], eight[3, 2], lwd = 2, col = 'green3') segments(p3_8.median[1], p3_8.median[2], eight[4, 1], eight[4, 2], lwd = 2, col = 'green3') segments(p3_8.median[1], p3_8.median[2], eight[5, 1], eight[5, 2], lwd = 2, col = 'green3') segments(p3_8.median[1], p3_8.median[2], eight[6, 1], eight[6, 2], lwd = 2, col = 'green3') segments(p3_8.median[1], p3_8.median[2], eight[7, 1], eight[7, 2], lwd = 2, col = 'green3') segments(p3_8.median[1], p3_8.median[2], eight[8, 1], eight[8, 2], lwd = 2, col = 'green3') segments(p12.median[1], p12.median[2], p3_8.median[1], p3_8.median[2], lwd = 2, col = 'grey') points(p12.median[1], p12.median[2], pch = 16, cex = 1) points(p3_8.median[1], p3_8.median[2], pch = 16, cex = 1) points(eight, pch = 21, cex = 2.7, bg = "white") text(eight, as.character(1:8), col = "red3", cex = 1) # Plot 4: The 8-point example —— dendrogram d = dist(eight, method = "euclidean", p = 2, diag = TRUE) # 欧氏距离矩阵 round(d, digits = 2) s1 = hclust(d, method = "single") # cluster analysis with single linkage algorithm 聚类分析-最短距离法 (single) graphics.off() plot(s1, hang = -0.1, frame.plot = TRUE, ann = FALSE) title(main = "Single Linkage Dendrogram - 8 points", ylab = "Euclidean Distance") abline(h = 3.5, lty = 2, col = 'cyan3') # cut the tree at distance 3.5 dd = d^2 # 欧氏距离平方的矩阵 round(dd, digits = 2) s2 = hclust(dd, method = "single") # cluster analysis with single linkage algorithm 聚类分析-最短距离法 (single) # The dendrogram for the 8-point example, Single linkage algorithm. graphics.off() plot(s2, hang = -0.1, frame.plot = TRUE, ann = FALSE) title(main = "Single Linkage Dendrogram - 8 points", ylab = "Squared Euclidean Distance") ############################################ Randomly select 20 observations from the bank notes data ####################### # clear all variables rm(list = ls(all = TRUE)) graphics.off() ## Load data library(mclust) data = banknote set.seed(3) xx = data[sample(1:nrow(data), 20, replace = F), ] # sample of 20 randomly chosen bank notes from data xx = xx[, 2:7] xx ## PCA 主成分分析 mean = as.vector(colMeans(xx)) # mean vector m = matrix(mean, nrow(xx), NROW(mean), byrow = T) x = xx - m # centering eig = eigen(cov(x)) # spectral decomposition eva = eig$values # eigenvalues eve = eig$vectors # eigenvectors xm = as.matrix(x) y = xm %*% eve # PC values ym = y[, 1:2] # first two PCs ## Plot 1: PCA graphics.off() plot(ym, type = "n", xlab = "first PC", ylab = "second PC", main = "20 Swiss bank notes", ylim = c(-5, 4), xlim = c(-4, 4.5), asp = 1) text(ym[, 1], ym[, 2], rownames(xx), cex = 1.2) abline(h = 0, v = 0, lty = 2, col = 'orange') ## Plot 2: Dendrogram for the 20 bank notes after applying the Ward algorithm d = dist(xx, method = "euclidean", p = 2) # euclidean distance matrix dd = d^2 # squared euclidean distance matrix w = hclust(dd, method = "ward.D") # cluster analysis with ward algorithm graphics.off() plot(w, hang = -0.1, frame.plot = TRUE, ann = FALSE) title(main = "Dendrogram for 20 Swiss bank notes", ylab = "Squared Euclidean Distance", xlab = "Ward algorithm") ## Plot 3: PCA with clusters library(car) graphics.off() groups = cutree(w, h = 60) merg = matrix(c(ym, as.matrix(groups)), nrow = 20, ncol = 3) merg = merg[sort.list(merg[, 3]), ] merg1 = merg[1:6, 1:2] merg2 = merg[7:20, 1:2] plot(ym, type = "n", xlab = "first PC", ylab = "second PC", main = "20 Swiss bank notes, cut height 60", ylim = c(-4, 4), xlim = c(-4, 4), asp = 1) abline(h = 0, v = 0, lty = 2, col = 'orange') text(ym[, 1], ym[, 2], rownames(xx), cex = 1.2) dataEllipse(x = merg1[, 1], y = merg1[, 2], center.pch = 0, col = "red", plot.points = F, add = T, levels = 0.85) dataEllipse(x = merg2[, 1], y = merg2[, 2], center.pch = 0, col = "blue", plot.points = F, add = T, levels = 0.95) ############################################ French food expenditure data set ############################################ # clear all variables rm(list = ls(all = TRUE)) graphics.off() ## Load data setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data") fooddat = read_csv("food.csv") food = as.data.frame(fooddat[, -1]) # delete the first column (types of families) rownames(food) = c("MA2", "EM2", "CA2", "MA3", "EM3", "CA3", "MA4", "EM4", "CA4", "MA5", "EM5", "CA5") # define types of families f = scale(food) # standardize variables round(f, digits = 3) ## PCA mean = as.vector(colMeans(f)) m = matrix(mean, nrow(f), NROW(mean), byrow = T) x = f - m eig = eigen(cov(x)) # spectral decomposition eva = eig$values eve = eig$vectors xm = as.matrix(x) y = xm %*% eve ym = y[, 1:2] # first two eigenvectors ## Plot 1: PCA plot(ym, type = "n", xlab = "first PC", ylab = "second PC", main = "French Food data", ylim = c(-5, 5), xlim = c(-5, 5), asp = 1) text(ym[, 1], ym[, 2], rownames(f), cex = 1.2) abline(h = 0, v = 0, lty = 2, col = 'orange') ## Plot 2: Dendrogram for the standardized food.dat after Ward algorithm d = dist(f, "euclidean", p = 2) # euclidean distance matrix dd = d^2 # squared euclidean distance matrix w = hclust(dd, method = "ward.D") # cluster analysis with ward algorithm graphics.off() plot(w, hang = -0.1, frame.plot = TRUE, ann = FALSE) title(main = "Ward Dendrogram for French Food", ylab = "Squared Euclidean Distance") ## Plot 3: PCA with clusters groups = cutree(w, h = 60) merg = matrix(c(ym, as.matrix(groups)), nrow = 12, ncol = 3) merg = merg[sort.list(merg[, 3]), ] merg1 = merg[1:7, 1:2] merg2 = merg[8:12, 1:2] graphics.off() plot(ym, type = "n", xlab = "first PC", ylab = "second PC", main = "French Food data, cut height 60", ylim = c(-5, 4), xlim = c(-5, 4.5), asp = 1) abline(h = 0, v = 0, lty = 2, col = 'orange') text(ym[, 1], ym[, 2], rownames(f), cex = 1.2) dataEllipse(x = merg1[, 1], y = merg1[, 2], center.pch = 0, col = "red", plot.points = F, add = T, levels = 0.8) dataEllipse(x = merg2[, 1], y = merg2[, 2], center.pch = 0, col = "blue", plot.points = F, add = T, levels = 0.65) ############################################ Adaptive Weights Clustering 自适应权重聚类 #################################### rm(list = ls(all = TRUE)) eight = cbind(c(-3, -2, -2, -2, 1, 1, 2, 4), c(0, 4, -1, -2, 4, 2, -4, -3)) eight eight = eight[c(8, 7, 3, 1, 4, 2, 6, 5), ] # 数据集重新排序 eight d = dist(eight, method = "euclidean", diag = TRUE, upper = TRUE, p = 2) # 计算欧氏距离 D = as.matrix(d) # 距离矩阵 nr = nrow(D) # 矩阵 D 的行数 nc = ncol(D) # 矩阵 D 的列数 W = matrix(0, nrow = nr, ncol = nc) # 权重矩阵的初始值 for (i in 1:nr){ for (j in 1:nc) { if (D[i, j] < 4) W[i, j] = 1 } } # 距离小于 4 的归为一类 round(D, digits = 2) # 距离矩阵 W # 权重矩阵 ################################################ Spectral Clustering 谱聚类 ################################################ rm(list = ls(all = TRUE)) library(kernlab) ?specc eight = cbind(c(-3, -2, -2, -2, 1, 1, 2, 4), c(0, 4, -1, -2, 4, 2, -4, -3)) eight eight = eight[c(8, 7, 3, 1, 4, 2, 6, 5), ] # 数据集重新排序 eight sc.eight = specc(eight, centers = 2) sc.eight centers(sc.eight) withinss(sc.eight) merg1 = eight[1:2, 1:2] merg2 = eight[3:8, 1:2] plot(eight, col = sc.eight, pch = 16, cex = 2, xlab = '', ylab = '', asp = 1, ylim = c(-6, 6)) dataEllipse(x = merg2[, 1], y = merg2[, 2], center.pch = 0, col = "blue", plot.points = F, add = T, levels = 0.75) ellipse(center = c(3, -3.5), shape = matrix(c(1, 0.6, 0.6, 1), nrow = 2), center.pch = 0, col = "red", radius = 1.2) ################################################ iris 数据集的聚类分析 ################################################ ## 系统聚类法 hclust() ?hclust rm(list = ls(all = TRUE)) x = iris head(x) d = dist(x[, 1:4]) # 距离矩阵 iris.hc1 = hclust(d, method = "complete") # 最长距离法 plot(iris.hc1, hang = -1) # 聚类树 re1 = rect.hclust(iris.hc1, k = 3) # 聚类树分成三类 re1 # 分成三类的结果 iris.id1 = cutree(iris.hc1, 3) # 编成三组 table(iris.id1, x$Species) # 分类结果对比 iris.hc2 = hclust(d, method = "centroid") # 重心法 plot(iris.hc2, hang = -1) # 聚类树 re2 = rect.hclust(iris.hc2, k = 3) # 聚类树分成三类 re2 # 分成三类的结果 iris.id2 = cutree(iris.hc2, 3) # 编成三组 table(iris.id2, x$Species) # 分类结果对比 iris.hc3 = hclust(d, method = "median") # 中位数法 plot(iris.hc3, hang = -1) # 聚类树 re3 = rect.hclust(iris.hc3, k = 3) # 聚类树分成三类 re3 # 分成三类的结果 iris.id3 = cutree(iris.hc3, 3) # 编成三组 table(iris.id3, x$Species) # 分类结果对比 ################################################ USArrests 数据集的聚类分析 ################################################ ## 动态聚类法 kmeans() rm(list = ls(all = TRUE)) graphics.off() library(factoextra) library(cluster) # 载入数据 df = USArrests # 移除有缺失值的数据,该数据集没有缺失数据 df = na.omit(df) # 标准化 df = scale(df) head(df) # 聚类数量与总体平方和的关系图,聚类数 k 应选择使总体平方和趋缓时的值 k = 4 fviz_nbclust(df, kmeans, method = "wss") # 根据聚类数量计算差距统计量 gap_stat = clusGap(df, FUN = kmeans, nstart = 25, K.max = 10, B = 50) # 聚类数量与差距统计量的关系图,聚类数 k 应选择使差距统计量最大时的值 k = 4 fviz_gap_stat(gap_stat) # 聚为 4 类的 kmeans 统计 km_df = kmeans(df, centers = 4, nstart = 25) # 查看聚类结果 km_df # 聚类图 fviz_cluster(km_df, data = df) # 计算每一类当中各变量的均值 aggregate(USArrests, by = list(cluster = km_df$cluster), mean) # 将聚类结果添加到原始数据当中 final_data = cbind(USArrests, Cluster = km_df$cluster) final_data ################################################ spirals 数据集的聚类分析 ################################################ ## 谱聚类法 specc() rm(list = ls(all = TRUE)) graphics.off() library(kernlab) # 载入数据 data(spirals) x = spirals head(x) # 数据集的散点图 plot(x, xlab = '', ylab = '', pch = 16, cex = 1.5) # 聚成 2 类的谱聚类 sc.x = specc(x, centers = 2) # 聚类结果 sc.x # 类的中心 centers(sc.x) # 每一类中数据点的数量 size(sc.x) # 每一类的组内平方和 withinss(sc.x) # 按照分类结果的散点图 plot(x, xlab = '', ylab = '', pch = 16, cex = 1.5, col = sc.x)