Chapter_15.R · 芝士酷

########################## Correspondence Analysis 对应分析 ##########################

## ---------------- French "baccalauréat" frequencies ----------------
rm(list = ls(all = TRUE))
graphics.off()
library(data.table)
setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data")
bac = fread("bac.csv", header = FALSE) # 读入数据
x = as.data.frame(bac)
colnames(x) = c("Region", "A", "B", "C", "D", "E", "F", "G", "H") # 命名变量
x

x.LORR = x[9, 2:9] # LORR 大区的数据
x.LORR
ratio.LORR = 100 * x.LORR / (sum(x.LORR)) # 计算 LORR 大区报考方向的百分比
round(ratio.LORR, digits = 1) # 显示结果

ratio = 100 * apply(x[, 2:9], 2, sum) / sum(x[, 2:9]) # 计算各列的百分比
round(ratio, digits = 1) # 显示结果

## ---------------- Chi-Squared Decomposition ----------------
rm(list = ls(all = TRUE))
x = data.frame(Frankfurt = c(4, 0, 1), Berlin = c(0, 1, 1), 
               Munich = c(2, 1, 4))
row.names(x) = c("Finance", "Energy", "HiTech")
x1 = cbind(x, RowSum = rowSums(x))
X = rbind(x1, RowSum = colSums(x1))
X

t = 0
for (i in 1:3){
  for (j in 1:3) t = t + (X[i, j] - X[i, 4] * X[4, j] / 
                            X[4, 4])^2 / (X[i, 4] * X[4, j] / X[4, 4])
}
t # 检验统计量的值

qchisq(0.05, df = 4, lower.tail = FALSE) # 检验的临界值
pchisq(t, 4, lower.tail = FALSE) # 检验的 p 值

## ---------------- Journaux Data 的对应分析 ----------------
rm(list = ls(all = TRUE))
graphics.off()
x = fread("journaux.csv", header  = FALSE) # 读入数据
x = as.data.frame(x)
x

a  = rowSums(x) # 每一行之和构成的向量
a
b  = colSums(x) # 每一列之和构成的向量
b
e  = matrix(a) %*% b / sum(a) # 各单元 (交叉位置) 的期望值 (独立性假设下)
round(e, digits = 2)

cc = (x - e)/sqrt(e) # 计算拟进行奇异值分解的 Chi 矩阵 cc
round(cc, digits = 2)

?svd
sv = svd(cc) # 奇异值分解
g = sv$u
round(g, digits = 2)

l = sv$d
round(l, digits = 2)

d = sv$v
round(d, digits = 2)

ll = l * l # Chi 矩阵 cc 的特征值
round(ll, digits = 2)

# cumulated percentage of the variance 方差的百分比与累积百分比
rat = ll / sum(ll) # 方差的百分比
aux  = cumsum(ll) / sum(ll) # 方差的累积百分比
perc = cbind(Lambda = ll, Perc_Var = rat, Cum_Perc = aux) # 合并在一起
round(perc, digits = 3)

# 计算行 (个体的) 权重
r1 = matrix(l, nrow = nrow(g), ncol = ncol(g), byrow = T) * g
r = r1/matrix(sqrt(a), nrow = nrow(g), ncol = ncol(g), byrow = F) 
round(r, digits = 3)

# 计算列 (变量的) 权重
s1 = matrix(l, nrow = nrow(d), ncol = ncol(d), byrow = T) * d
s = s1/matrix(sqrt(b), nrow = nrow(d), ncol = ncol(d), byrow = F) 
round(s, digits = 3)

# 计算行的绝对贡献
car = matrix(matrix(a), nrow = nrow(r), ncol = ncol(r), byrow = F) * 
  r^2 / matrix(l^2, nrow = nrow(r), ncol = ncol(r), byrow = T)
round(car, digits = 3) # 显示结果

# 计算列的绝对贡献
cas = matrix(matrix(b), nrow = nrow(s), ncol = ncol(s), byrow = F) * 
  s^2 / matrix(l^2, nrow = nrow(s), ncol = ncol(s), byrow = T)
round(cas, digits = 3) # 显示结果

# 分别取前两个因子
rr = r[, 1:2]
round(rr, digits = 3) # 显示结果
ss = s[, 1:2]
round(ss, digits = 3) # 显示结果

# labels for journals 报纸的标志
types    = c("va", "vb", "vc", "vd", "ve", "ff", "fg", "fh", 
             "fi", "bj", "bk", "bl", "vm", "fn", "fo")

# labels for regions 地区的标志
regions  = c("brw", "bxl", "anv", "brf", "foc", "for", "hai", 
             "lig", "lim", "lux")

# plot 绘图
plot(rr, type = "n", xlim = c(-1.1, 1.5), ylim = c(-1.1, 0.6), 
     xlab = "r_1,s_1", 
     ylab = "r_2,s_2", main = "Journal Data", 
     cex.axis = 1.2, cex.lab = 1.2, cex.main = 1.6)
points(ss, type = "n")
text(rr, types, cex = 1.5, col = "blue")
text(ss, regions, col = "red")
abline(h = 0, v = 0, lwd = 1, lty = 2)


## ---------------- French baccalauréat Data 的对应分析 ----------------
# clear all variables
rm(list = ls(all = TRUE))
graphics.off()

# 读入数据
bac = fread("bac.csv", header = FALSE) # 读入数据
x1 = as.data.frame(bac)
colnames(x1) = c("Region", "A", "B", "C", "D", "E", "F", "G", "H") # 命名变量
x1
x1 = x1[, 2:ncol(x1)]
x1

# set to 0/1 to ex/include Corsica 设置 0/1 分别对应 不含/包含 Corsica 大区
wcors = 0
wcorsica = c(rep(1, nrow(x1) - 1), wcors)
wcorsica

x = subset(x1, wcorsica == 1) # 不含 Corsica 大区的子数据集
x
a = rowSums(x) # 计算每一行之和
a
b = colSums(x) # 计算每一列之和
b
e = matrix(a) %*% b / sum(a) # 计算各单元 (交叉位置) 的期望值 (独立性假设下)
round(e, digits = 2)

# chi-matrix 计算拟进行奇异值分解的 Chi 矩阵
cc = (x - e) / sqrt(e)
round(cc, digits = 2)

# singular value decomposition 矩阵 cc 的奇异值分解
sv = svd(cc)
g = sv$u
round(g, digits = 2)

l = sv$d
round(l, digits = 2)

d = sv$v
round(d, digits = 2)

# eigenvalues 特征值
ll   = l * l
round(ll, digits = 2)

# cumulated percentage of the variance 方差的百分比与累积百分比
rat = ll / sum(ll) # 方差的百分比
aux  = cumsum(ll) / sum(ll) # 方差的累积百分比
perc = cbind(Lambda = ll, Perc_Var = rat, Cum_Perc = aux) # 合并在一起
round(perc, digits = 3)

# 计算行 (个体的) 权重
r1 = matrix(l, nrow = nrow(g), ncol = ncol(g), byrow = T) * g
r = (r1/matrix(sqrt(a), nrow = nrow(g), ncol = ncol(g), byrow = F)) * (-1)
round(r, digits = 3)


# 计算列 (变量的) 权重
s1 = matrix(l, nrow = nrow(d), ncol = ncol(d), byrow = T) * d
s = (s1/matrix(sqrt(b), nrow = nrow(d), 
               ncol = ncol(d), byrow = F)) * (-1)
round(s, digits = 3)

# 计算行的绝对贡献
car  = matrix(matrix(a), nrow = nrow(r), ncol = ncol(r), 
              byrow = F) * r^2/matrix(l^2, nrow = nrow(r), 
                                      ncol = ncol(r), byrow = T)
round(car, digits = 3) # 显示结果

# 计算列的绝对贡献
cas  = matrix(matrix(b), nrow = nrow(s), ncol = ncol(s), 
              byrow = F) * s^2/matrix(l^2, nrow = nrow(s), 
                                      ncol = ncol(s), byrow = T)
round(cas, digits = 3) # 显示结果

# 分别取前两个因子
rr = r[, 1:2]
round(rr, digits = 3) # 显示结果
ss = s[, 1:2]
round(ss, digits = 3) # 显示结果

if (wcors == 0) {
  # labels for modalities
  types   = c("A", "B", "C", "D", "E", "F", "G", "H")
  
  # labels for regions
  regions = c("ildf", "cham", "pica", "hnor", "cent", "bnor", "bour", "nopc", 
              "lorr", "alsa", "frac", "payl", "bret", "pcha", "aqui", "midi", 
              "limo", "rhoa", "auve", "laro", "prov")
  
  # plot 1
  plot(rr, type = "n", xlim = c(-0.25, 0.15), ylim = c(-0.15, 0.15), xlab = "r_1,s_1", 
       ylab = "r_2,s_2", main = "Baccalaureat Data", cex.axis = 1.2, cex.lab = 1.2, 
       cex.main = 1.6)
  points(ss, type = "n")
  text(rr, regions, col = "blue")
  text(ss, types, cex = 1.5, col = "red")
  abline(h = 0, v = 0, lwd = 2)
} else {
  # labels for modalities
  types   = c("A", "B", "C", "D", "E", "F", "G", "H")
  
  # labels for regions
  regions = c("ildf", "cham", "pica", "hnor", "cent", "bnor", "bour", "nopc", 
              "lorr", "alsa", "frac", "payl", "bret", "pcha", "aqui", "midi", "limo", "rhoa", 
              "auve", "laro", "prov", "cors")
  
  # plot 2
  plot(rr, type = "n", xlim = c(-0.2, 0.25), ylim = c(-0.5, 0.15), xlab = "r_1,s_1", 
       ylab = "r_2,s_2", main = "Baccalaureat Data", cex.axis = 1.2, cex.lab = 1.2, 
       cex.main = 1.6)
  points(ss, type = "n")
  text(rr, regions, col = "blue")
  text(ss, types, cex = 1.5, col = "red")
  abline(h = 0, v = 0, lwd = 2)
}

## ---------------- MASS 包中 caith 数据集的对应分析 ----------------
rm(list = ls(all = TRUE))
graphics.off()
library(MASS)

## 读入数据: Caithness 地区人群的头发与眼睛颜色的调查数据，
## 行代表眼睛的颜色，列是头发的颜色
x = caith
x

## 检验行与列变量的独立性：p-value < 2.2e-16，远小于0.05，
## 说明行变量 (眼睛颜色) 和列变量 (头发颜色) 不独立，可做对应分析
?chisq.test
chisq.test(x)

## 利用 MASS 包中的 corresp() 函数作对应分析
x.corres = corresp(x, nf = 2)
x.corres # 显示结果

## 用 biplot() 函数绘制对应分析的散点图
biplot(x.corres)
abline(h = 0, v = 0, lty = 2)

## 可以分为三类：
## 1. 眼睛颜色为 dark 对应头发颜色为 black 和 dark.
## 2. 眼睛颜色为 blue 对应头发颜色为 fair.
## 3. 眼睛颜色为 medium 对应头发颜色为 red 和 medium.

## ---------------- 收入与品牌选择数据的对应分析 ----------------
rm(list = ls(all = TRUE))
graphics.off()
library(ca)

## 读入数据
brand = data.frame(Low = c(2, 49, 4, 4, 15, 1), 
                   Medium = c(7, 7, 5, 49, 2, 7), 
                   High = c(6, 3, 23, 5, 5, 14))
row.names(brand) = c("A", "B", "C", "D", "E", "F")
x = t(brand)
x

## 检验行与列变量的独立性：p-value < 2.2e-16，远小于0.05，
## 说明行变量 (收入) 和列变量 (品牌选择) 不独立，可做对应分析
chisq.test(x)

## 利用 ca 包中的 ca() 函数作对应分析
x.ca = ca(x)
x.ca # 显示结果

## 前两个因子的坐标
x.ca$rowcoord # 行坐标
x.ca$colcoord # 列坐标

## 绘制对应分析的散点图
plot(x.ca)

## 可以分为三类：
## 1. 低收入人群倾向于选择品牌 B 和 E.
## 2. 中等收入群人倾向于选择品牌 D.
## 3. 高收入群人倾向于选择品牌 A, C 和 F.