########################## Correspondence Analysis 对应分析 ########################## ## ---------------- French "baccalauréat" frequencies ---------------- rm(list = ls(all = TRUE)) graphics.off() library(data.table) setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data") bac = fread("bac.csv", header = FALSE) # 读入数据 x = as.data.frame(bac) colnames(x) = c("Region", "A", "B", "C", "D", "E", "F", "G", "H") # 命名变量 x x.LORR = x[9, 2:9] # LORR 大区的数据 x.LORR ratio.LORR = 100 * x.LORR / (sum(x.LORR)) # 计算 LORR 大区报考方向的百分比 round(ratio.LORR, digits = 1) # 显示结果 ratio = 100 * apply(x[, 2:9], 2, sum) / sum(x[, 2:9]) # 计算各列的百分比 round(ratio, digits = 1) # 显示结果 ## ---------------- Chi-Squared Decomposition ---------------- rm(list = ls(all = TRUE)) x = data.frame(Frankfurt = c(4, 0, 1), Berlin = c(0, 1, 1), Munich = c(2, 1, 4)) row.names(x) = c("Finance", "Energy", "HiTech") x1 = cbind(x, RowSum = rowSums(x)) X = rbind(x1, RowSum = colSums(x1)) X t = 0 for (i in 1:3){ for (j in 1:3) t = t + (X[i, j] - X[i, 4] * X[4, j] / X[4, 4])^2 / (X[i, 4] * X[4, j] / X[4, 4]) } t # 检验统计量的值 qchisq(0.05, df = 4, lower.tail = FALSE) # 检验的临界值 pchisq(t, 4, lower.tail = FALSE) # 检验的 p 值 ## ---------------- Journaux Data 的对应分析 ---------------- rm(list = ls(all = TRUE)) graphics.off() x = fread("journaux.csv", header = FALSE) # 读入数据 x = as.data.frame(x) x a = rowSums(x) # 每一行之和构成的向量 a b = colSums(x) # 每一列之和构成的向量 b e = matrix(a) %*% b / sum(a) # 各单元 (交叉位置) 的期望值 (独立性假设下) round(e, digits = 2) cc = (x - e)/sqrt(e) # 计算拟进行奇异值分解的 Chi 矩阵 cc round(cc, digits = 2) ?svd sv = svd(cc) # 奇异值分解 g = sv$u round(g, digits = 2) l = sv$d round(l, digits = 2) d = sv$v round(d, digits = 2) ll = l * l # Chi 矩阵 cc 的特征值 round(ll, digits = 2) # cumulated percentage of the variance 方差的百分比与累积百分比 rat = ll / sum(ll) # 方差的百分比 aux = cumsum(ll) / sum(ll) # 方差的累积百分比 perc = cbind(Lambda = ll, Perc_Var = rat, Cum_Perc = aux) # 合并在一起 round(perc, digits = 3) # 计算行 (个体的) 权重 r1 = matrix(l, nrow = nrow(g), ncol = ncol(g), byrow = T) * g r = r1/matrix(sqrt(a), nrow = nrow(g), ncol = ncol(g), byrow = F) round(r, digits = 3) # 计算列 (变量的) 权重 s1 = matrix(l, nrow = nrow(d), ncol = ncol(d), byrow = T) * d s = s1/matrix(sqrt(b), nrow = nrow(d), ncol = ncol(d), byrow = F) round(s, digits = 3) # 计算行的绝对贡献 car = matrix(matrix(a), nrow = nrow(r), ncol = ncol(r), byrow = F) * r^2 / matrix(l^2, nrow = nrow(r), ncol = ncol(r), byrow = T) round(car, digits = 3) # 显示结果 # 计算列的绝对贡献 cas = matrix(matrix(b), nrow = nrow(s), ncol = ncol(s), byrow = F) * s^2 / matrix(l^2, nrow = nrow(s), ncol = ncol(s), byrow = T) round(cas, digits = 3) # 显示结果 # 分别取前两个因子 rr = r[, 1:2] round(rr, digits = 3) # 显示结果 ss = s[, 1:2] round(ss, digits = 3) # 显示结果 # labels for journals 报纸的标志 types = c("va", "vb", "vc", "vd", "ve", "ff", "fg", "fh", "fi", "bj", "bk", "bl", "vm", "fn", "fo") # labels for regions 地区的标志 regions = c("brw", "bxl", "anv", "brf", "foc", "for", "hai", "lig", "lim", "lux") # plot 绘图 plot(rr, type = "n", xlim = c(-1.1, 1.5), ylim = c(-1.1, 0.6), xlab = "r_1,s_1", ylab = "r_2,s_2", main = "Journal Data", cex.axis = 1.2, cex.lab = 1.2, cex.main = 1.6) points(ss, type = "n") text(rr, types, cex = 1.5, col = "blue") text(ss, regions, col = "red") abline(h = 0, v = 0, lwd = 1, lty = 2) ## ---------------- French baccalauréat Data 的对应分析 ---------------- # clear all variables rm(list = ls(all = TRUE)) graphics.off() # 读入数据 bac = fread("bac.csv", header = FALSE) # 读入数据 x1 = as.data.frame(bac) colnames(x1) = c("Region", "A", "B", "C", "D", "E", "F", "G", "H") # 命名变量 x1 x1 = x1[, 2:ncol(x1)] x1 # set to 0/1 to ex/include Corsica 设置 0/1 分别对应 不含/包含 Corsica 大区 wcors = 0 wcorsica = c(rep(1, nrow(x1) - 1), wcors) wcorsica x = subset(x1, wcorsica == 1) # 不含 Corsica 大区的子数据集 x a = rowSums(x) # 计算每一行之和 a b = colSums(x) # 计算每一列之和 b e = matrix(a) %*% b / sum(a) # 计算各单元 (交叉位置) 的期望值 (独立性假设下) round(e, digits = 2) # chi-matrix 计算拟进行奇异值分解的 Chi 矩阵 cc = (x - e) / sqrt(e) round(cc, digits = 2) # singular value decomposition 矩阵 cc 的奇异值分解 sv = svd(cc) g = sv$u round(g, digits = 2) l = sv$d round(l, digits = 2) d = sv$v round(d, digits = 2) # eigenvalues 特征值 ll = l * l round(ll, digits = 2) # cumulated percentage of the variance 方差的百分比与累积百分比 rat = ll / sum(ll) # 方差的百分比 aux = cumsum(ll) / sum(ll) # 方差的累积百分比 perc = cbind(Lambda = ll, Perc_Var = rat, Cum_Perc = aux) # 合并在一起 round(perc, digits = 3) # 计算行 (个体的) 权重 r1 = matrix(l, nrow = nrow(g), ncol = ncol(g), byrow = T) * g r = (r1/matrix(sqrt(a), nrow = nrow(g), ncol = ncol(g), byrow = F)) * (-1) round(r, digits = 3) # 计算列 (变量的) 权重 s1 = matrix(l, nrow = nrow(d), ncol = ncol(d), byrow = T) * d s = (s1/matrix(sqrt(b), nrow = nrow(d), ncol = ncol(d), byrow = F)) * (-1) round(s, digits = 3) # 计算行的绝对贡献 car = matrix(matrix(a), nrow = nrow(r), ncol = ncol(r), byrow = F) * r^2/matrix(l^2, nrow = nrow(r), ncol = ncol(r), byrow = T) round(car, digits = 3) # 显示结果 # 计算列的绝对贡献 cas = matrix(matrix(b), nrow = nrow(s), ncol = ncol(s), byrow = F) * s^2/matrix(l^2, nrow = nrow(s), ncol = ncol(s), byrow = T) round(cas, digits = 3) # 显示结果 # 分别取前两个因子 rr = r[, 1:2] round(rr, digits = 3) # 显示结果 ss = s[, 1:2] round(ss, digits = 3) # 显示结果 if (wcors == 0) { # labels for modalities types = c("A", "B", "C", "D", "E", "F", "G", "H") # labels for regions regions = c("ildf", "cham", "pica", "hnor", "cent", "bnor", "bour", "nopc", "lorr", "alsa", "frac", "payl", "bret", "pcha", "aqui", "midi", "limo", "rhoa", "auve", "laro", "prov") # plot 1 plot(rr, type = "n", xlim = c(-0.25, 0.15), ylim = c(-0.15, 0.15), xlab = "r_1,s_1", ylab = "r_2,s_2", main = "Baccalaureat Data", cex.axis = 1.2, cex.lab = 1.2, cex.main = 1.6) points(ss, type = "n") text(rr, regions, col = "blue") text(ss, types, cex = 1.5, col = "red") abline(h = 0, v = 0, lwd = 2) } else { # labels for modalities types = c("A", "B", "C", "D", "E", "F", "G", "H") # labels for regions regions = c("ildf", "cham", "pica", "hnor", "cent", "bnor", "bour", "nopc", "lorr", "alsa", "frac", "payl", "bret", "pcha", "aqui", "midi", "limo", "rhoa", "auve", "laro", "prov", "cors") # plot 2 plot(rr, type = "n", xlim = c(-0.2, 0.25), ylim = c(-0.5, 0.15), xlab = "r_1,s_1", ylab = "r_2,s_2", main = "Baccalaureat Data", cex.axis = 1.2, cex.lab = 1.2, cex.main = 1.6) points(ss, type = "n") text(rr, regions, col = "blue") text(ss, types, cex = 1.5, col = "red") abline(h = 0, v = 0, lwd = 2) } ## ---------------- MASS 包中 caith 数据集的对应分析 ---------------- rm(list = ls(all = TRUE)) graphics.off() library(MASS) ## 读入数据: Caithness 地区人群的头发与眼睛颜色的调查数据, ## 行代表眼睛的颜色,列是头发的颜色 x = caith x ## 检验行与列变量的独立性:p-value < 2.2e-16,远小于0.05, ## 说明行变量 (眼睛颜色) 和列变量 (头发颜色) 不独立,可做对应分析 ?chisq.test chisq.test(x) ## 利用 MASS 包中的 corresp() 函数作对应分析 x.corres = corresp(x, nf = 2) x.corres # 显示结果 ## 用 biplot() 函数绘制对应分析的散点图 biplot(x.corres) abline(h = 0, v = 0, lty = 2) ## 可以分为三类: ## 1. 眼睛颜色为 dark 对应头发颜色为 black 和 dark. ## 2. 眼睛颜色为 blue 对应头发颜色为 fair. ## 3. 眼睛颜色为 medium 对应头发颜色为 red 和 medium. ## ---------------- 收入与品牌选择数据的对应分析 ---------------- rm(list = ls(all = TRUE)) graphics.off() library(ca) ## 读入数据 brand = data.frame(Low = c(2, 49, 4, 4, 15, 1), Medium = c(7, 7, 5, 49, 2, 7), High = c(6, 3, 23, 5, 5, 14)) row.names(brand) = c("A", "B", "C", "D", "E", "F") x = t(brand) x ## 检验行与列变量的独立性:p-value < 2.2e-16,远小于0.05, ## 说明行变量 (收入) 和列变量 (品牌选择) 不独立,可做对应分析 chisq.test(x) ## 利用 ca 包中的 ca() 函数作对应分析 x.ca = ca(x) x.ca # 显示结果 ## 前两个因子的坐标 x.ca$rowcoord # 行坐标 x.ca$colcoord # 列坐标 ## 绘制对应分析的散点图 plot(x.ca) ## 可以分为三类: ## 1. 低收入人群倾向于选择品牌 B 和 E. ## 2. 中等收入群人倾向于选择品牌 D. ## 3. 高收入群人倾向于选择品牌 A, C 和 F.