Chapter_12.R · 芝士酷

# clear variables and close windows
rm(list = ls(all = TRUE))
graphics.off()
options(digits = 3)

###################################### Car Marks 数据 ######################################
# load data 载入数据
load("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data/carmean2.rda")
(x = carmean2)

###################################### 先作主成分分析
NPCA.car = princomp(x, cor = TRUE, scores = TRUE, fix_sign = TRUE)
summary(NPCA.car)

# 主成分的标准差
NPCA.car$sdev

# 主成分在原变量的载荷（权重）
NPCA.car$loadings

# 碎石图
screeplot(NPCA.car, ylim = c(0, 6), main = 'Scree Plot of NPCA for Car Marks')

# 观测值在各主成分的投影
NPCA.car$scores

# 各变量与第一主成分的相关系数
cor.x_NPC1 = NPCA.car$sdev[1] * NPCA.car$loadings[, 1]
# 各变量与第二主成分的相关系数
cor.x_NPC2 = NPCA.car$sdev[2] * NPCA.car$loadings[, 2]
# 各变量方差由前两个主成分解释的比例
NPC2.prop = cor.x_NPC1^2 + cor.x_NPC2^2
cbind(NPC1 = cor.x_NPC1, NPC2 = cor.x_NPC2, Sum_Squares = NPC2.prop)

# 变量投影在前两个主成分上的散点图
graphics.off()
ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi))
plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "First PC", ylab = "Second PC", 
     main = "Carmeans", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.8, lwd = 2, asp = 1)
abline(h = 0, v = 0)
# label = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8")
label = names(cor.x_NPC1)
text(-cor.x_NPC1, cor.x_NPC2, label, cex = 1)

# 观测值在前两个主成分的散点图
graphics.off()
plot(cbind(NPCA.car$scores[, 1], NPCA.car$scores[, 2]), type = "n", xlab = "PC 1", ylab = "PC 2",
     main = "First NPC vs. Second NPC", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6)
abline(h = 0, v = 0, lty = 2)
label = 1:NPCA.car$n.obs
text(NPCA.car$scores[, 1], NPCA.car$scores[, 2], label)

###################################### 再来确定因子数
# correlation matrix 计算相关矩阵
(r = cor(x))

# 将相关矩阵主对角元素为零，记为 m
m = r
for (i in 1:ncol(r)) {
  m[i, i] = r[i, i] - 1
}
m

# 计算 psi_{jj}
psi = matrix(0, 8, 8)
for (i in 1:8) {
  psi[i, i] = 1 - max(abs(m[, i]))
}
psi

# 对 R - Psi 作谱分解 spectral decomposition 
eig = eigen(r - psi)

# 特征值: 前两个特征值为正，且与其它相比较大，所以取 k = 2 个因子即可
ee  = eig$values
ee
ee  = eig$values[1:2] # 取前两个大的特征值
ee

# 特征向量
vv  = eig$vectors
vv
vv  = eig$vectors[, 1:2] # 前两个特征向量
vv  = t(t(vv[, 1:2]) * sign(vv[2, 1:2])) # 调整特征向量的符号
q1  = sqrt(ee[1]) * vv[, 1] # 变量在第一个主方向上的投影
q2  = sqrt(ee[2]) * vv[, 2] # 变量在第二个主方向上的投影
(q   = cbind(q1, q2))

# plot 变量在前两个因子上的投影散点图
plot(q, type = "n", xlab = "First Factor", ylab = "Second Factor", main = "Car Marks Data", 
     xlim = c(-1.2, 1.2), ylim = c(-0.4, 0.9), cex.lab = 1.4, cex.axis = 1.4, cex.main = 1.8)
text(q, colnames(x), cex = 1.2, xpd = NA)
abline(v = 0)
abline(h = 0)


###################################### Consumer-Preference Study ######################################
rm(list = ls(all = TRUE))
graphics.off()

x = c(1.00, 0.02, 0.96, 0.42, 0.01, 0.02, 1.00, 0.13, 0.71, 0.85,
      0.96, 0.13, 1.00, 0.50, 0.11, 0.42, 0.71, 0.50, 1.00, 0.79,
      0.01, 0.85, 0.11, 0.79, 1.00)
R = matrix(x, nrow = 5, byrow = TRUE)
R

R.eig = eigen(R) # 谱分解
R.eig$values # 特征值
sum(R.eig$values[1:2]) / 5 # 两个公共因子的累积方差占总方差的比例

# 公共因子的载荷 (k=2)
load.F = R.eig$vectors %*% diag(sqrt(R.eig$values))
round(load.F[, 1:2], digits = 2)

# 公共因子方差 (k=2)
h = load.F^2
h = h[, 1] + h[, 2]
round(h, digits = 2)

# 特殊因子方差 (k=2)
Psi = 1 - h
round(Psi, digits = 2)

# 放在一个数据框中
FA = cbind(Load_F1 = load.F[, 1], Load_F2 = load.F[, 2], Communalities = h, S_Variances = Psi)
round(FA, digits = 2)

# 验证估计的效果
load.F[, 1:2] %*% t(load.F[, 1:2]) + diag(Psi)


###################################### Boston Housing 数据集的因子分析 ######################################
# clear variables and close windows
rm(list = ls(all = TRUE))
library(MASS)
data = Boston
head(data)

# transform data 变量数据的变换
xt = data
xt[, 1]  = log(data[, 1])
xt[, 2]  = data[, 2]/10
xt[, 3]  = log(data[, 3])
xt[, 5]  = log(data[, 5])
xt[, 6]  = log(data[, 6])
xt[, 7]  = (data[, 7]^(2.5))/10000
xt[, 8]  = log(data[, 8])
xt[, 9]  = log(data[, 9])
xt[, 10] = log(data[, 10])
xt[, 11] = exp(0.4 * data[, 11])/1000
xt[, 12] = data[, 12]/100
xt[, 13] = sqrt(data[, 13])
xt[, 14] = log(data[, 14])
data = xt[, -4] # 剔除变量 X_4
round(head(data), digits = 4)

# rename variables 重新命名变量
colnames(data) = c("X1", "X2", "X3", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")
round(head(data), digits = 4)

# standardize variables 变量标准化
da  = scale(data)
round(head(da), digits = 4)

# correlation matrix 计算相关矩阵
dat = cor(da)
round(dat, digits = 4)

# R 中的因子分析
?factanal

# Maximum Likelihood Factor Analysis without varimax rotation: factanal 极大似然因子分析，未作 varimax 旋转
mlm  = factanal(da, 3, rotation = "none", covmat = dat)
str(mlm)

# estimated factor loadings 因子载荷的估计值
load = mlm$loadings
load

# the estimated factor loadings matrix 因子载荷矩阵
ld = cbind(load[, 1], load[, 2], load[, 3])
round(ld, digits = 4)

# communalities are calculated 计算公共因子方差
com = diag(ld %*% t(ld))
round(com, digits = 4)

# specific variances are calculated 计算特殊因子方差
psi  = diag(dat) - diag(ld %*% t(ld))
round(psi, digits = 4)

# 因子载荷、公共因子方差、特殊因子方差的矩阵
tbl  = cbind(L_F1 = load[, 1], L_F2 = load[, 2], L_F3 = load[, 3], com, psi)
round(tbl, digits = 4)

# plot first factor against second 变量在因子 1 和因子 2 平面上的散点图
graphics.off()
plot(load[, 1], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors21 - theta21", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1.0, 1.0), ylim = c(-0.6, 0.6), asp = 1)
text(load[, 1], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

# plot first factor against third 变量在因子 1 和因子 3 平面上的散点图
graphics.off()
plot(load[, 1], load[, 3], type = "n", xlab = "x", ylab = "y", main = "Factors31 - theta31", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1.0, 1.0), ylim = c(-0.6, 0.6), asp = 1)
text(load[, 1], load[, 3], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

# plot second factor against third 变量在因子 2 和因子 3 平面上的散点图
graphics.off()
plot(load[, 3], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors23 - theta23", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1.0, 1.0), 
     ylim = c(-0.6, 0.6), asp = 1)
text(load[, 3], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

# 三张图放在一起
graphics.off()
par(mfcol = c(2, 2))
plot(load[, 1], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors21 - theta21", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1.0, 1.0), ylim = c(-0.6, 0.6), asp = 1)
text(load[, 1], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

plot(load[, 1], load[, 3], type = "n", xlab = "x", ylab = "y", main = "Factors31 - theta31", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1.0, 1.0), ylim = c(-0.6, 0.6), asp = 1)
text(load[, 1], load[, 3], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

plot(load[, 3], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors23 - theta23", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1.0, 1.0), 
     ylim = c(-0.6, 0.6), asp = 1)
text(load[, 3], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

# R 中的因子旋转
?varimax

# Maximum Likelihood Factor Analysis after varimax rotation 极大似然方法的因子分析，因子旋转之后
# rotates the factor loadings matrix 因子载荷矩阵的旋转
var = varimax(ld)
str(var)

# estimated factor loadings after varimax 旋转后的因子载荷
load = var$loadings
round(load, digits = 4)

# 旋转后的因子载荷矩阵
vl   = cbind(load[, 1], load[, 2], load[, 3])
round(vl, digits = 4)

# 旋转矩阵
rm = var$rotmat
round(rm, digits = 4)

# communalities are calculated 计算公共因子方差
com  = diag(vl %*% t(vl))
round(com, digits = 4)

# specific variances are calculated 计算特殊因子方差
psi  = diag(dat) - diag(vl %*% t(vl))
round(psi, digits = 4)

# 旋转后的因子载荷、公共因子方差、特殊因子方差的矩阵
tbl  = cbind(RL_F1 = load[, 1], RL_F2 = load[, 2], RL_F3 = load[, 3], com, psi)
round(tbl, digits = 4)

graphics.off()
par(mfcol = c(2, 2))
# plot first factor against second 旋转后变量在第一、第二公共因子的散点图
plot(load[, 1], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors21 - theta21", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1, 1), ylim = c(-1, 1), asp = 1)
text(load[, 1], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0)

# plot first factor against third 旋转后变量在第一、第三公共因子的散点图
plot(load[, 1], load[, 3], type = "n", xlab = "x", ylab = "y", main = "Factors31 - theta31", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1, 1), ylim = c(-1, 1), asp = 1)
text(load[, 1], load[, 3], colnames(data), cex = 1.1)
abline(h = 0, v = 0)

# plot second factor against third 旋转后变量在第二、第三公共因子的散点图
plot(load[, 3], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors23 - theta23", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1, 1), ylim = c(-1, 1), asp = 1)
text(load[, 3], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0)

#### Principal Component Method after varimax rotation spectral decomposition 主成分方法的因子分析，旋转后
# 相关矩阵的谱分解
e = eigen(dat)

# 特征值
eigval.all = e$values # 全部特征值
round(eigval.all, digits = 4)
eigval = e$values[1:3] # 前三个最大特征值
round(eigval, digits = 4)

# 特征向量
eigvec.all = e$vectors # 全部特征向量
round(eigvec.all, digits = 4)
eigvec = e$vectors[, 1:3] # 前三个特征向量
round(eigvec, digits = 4)

# the estimated factor loadings matrix 计算因子载荷矩阵的估计值
Q = eigvec %*% sqrt(diag(eigval))
round(Q, digits = 4)

# rotates the factor loadings matrix 因子载荷矩阵的旋转
pcm = varimax(Q)
load = pcm$loadings # estimated factor loadings after varimax 旋转后的因子载荷
ld     = cbind(load[, 1], load[, 2], load[, 3]) # 旋转后的因子载荷矩阵
round(ld, digits = 4)

rm = pcm$rotmat # 旋转矩阵
round(rm, digits = 4)

# communalities are calculated 计算公共因子方差
com = diag(ld %*% t(ld))
names(com) = row.names(dat)
round(com, digits = 4)

# specific variances are calculated 计算特殊因子方差
psi = diag(dat) - diag(ld %*% t(ld))
round(psi, digits = 4)

# 旋转后的因子载荷、公共因子方差、特殊因子方差的矩阵
tbl = cbind(LF1 = load[, 1], LF2 = load[, 2], LF3 = load[, 3], com, psi)
round(tbl, digits = 4)

graphics.off()
par(mfcol = c(2, 2))
# plot first factor against second 旋转后变量在第一、第二公共因子的散点图
plot(load[, 1], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors21 - theta21", xlim = c(-1, 1),
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, ylim = c(-1, 1), asp = 1)
text(load[, 1], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

# plot first factor against third 旋转后变量在第一、第三公共因子的散点图
plot(load[, 1], load[, 3], type = "n", xlab = "x", ylab = "y", main = "Factors31 - theta31",  xlim = c(-1, 1),
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, ylim = c(-1, 1), asp = 1)
text(load[, 1], load[, 3], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

# plot second factor against third 旋转后变量在第二、第三公共因子的散点图
plot(load[, 3], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors23 - theta23",  xlim = c(-1, 1),
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, ylim = c(-1, 1), asp = 1)
text(load[, 3], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

#### Principal Factor Method after varimax rotation inverse of the correlation matrix
#### 主因子方法的因子分析，旋转后，从相关矩阵的逆矩阵出发
f = solve(dat) # 相关矩阵的逆矩阵
# preliminary estimate of psi - (psi 的) 初始值
psiini = diag(1/f[row(f) == col(f)])
psi = psiini
round(psi, digits = 4)

# 迭代求解因子载荷矩阵
for (i in 1:10) {
  ee = eigen(dat - psi) # R - Psi 的谱分解
  eigval = ee$values[1:3] # 取前三个特征值
  eigvec = ee$vectors[, 1:3] # 取前三个特征向量
  EE = matrix(eigval, nrow(dat), ncol = 3, byrow = T)
  QQ = sqrt(EE) * eigvec # 计算载荷矩阵
  psiold = psi
  psi = diag(as.vector(1 - t(colSums(t(QQ * QQ))))) # 利用载荷矩阵计算 psi 的估计值
  i = i + 1
  z = psi - psiold
  convergence = z[row(z) == col(z)]
}
round(QQ, digits = 4) # 未旋转的因子载荷矩阵

# rotates the factor loadings matrix 因子载荷矩阵的旋转
pfm  = varimax(QQ)
# estimated factor loadings after varimax 旋转后的因子载荷矩阵
load = pfm$loadings
ld   = cbind(load[, 1], load[, 2], load[, 3])
round(ld, digits = 4)

# communalities are calculated 计算公共因子方差
com  = diag(ld %*% t(ld))
round(com, digits = 4)

# specific variances are calculated 计算特殊因子方差
psi  = diag(dat) - diag(ld %*% t(ld))
round(psi, digits = 4)

# 旋转后的因子载荷、公共因子方差、特殊因子方差的矩阵
tbl  = cbind(L_P1 = load[, 1], L_P2 = load[, 2], L_P3 = load[, 3], com, psi)
round(tbl, digits = 4)

graphics.off()
par(mfcol = c(2, 2))
# plot first factor against second 旋转后变量在第一、第二公共因子的散点图
plot(load[, 1], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors21 - theta21", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1, 1), ylim = c(-1, 1), asp = 1)
text(load[, 1], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

# plot first factor against third 旋转后变量在第一、第三公共因子的散点图
plot(load[, 1], load[, 3], type = "n", xlab = "x", ylab = "y", main = "Factors31 - theta31", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1, 1), ylim = c(-1, 1), asp = 1)
text(load[, 1], load[, 3], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)

# plot second factor against third 旋转后变量在第二、第三公共因子的散点图
plot(load[, 3], load[, 2], type = "n", xlab = "x", ylab = "y", main = "Factors23 - theta24", 
     font.main = 1, cex.lab = 1.1, cex.axis = 1.1, cex.main = 1.4, xlim = c(-1, 1), ylim = c(-1, 1), asp = 1)
text(load[, 3], load[, 2], colnames(data), cex = 1.1)
abline(h = 0, v = 0, lty = 2)


######################################## Ability and Intelligence Tests 主成分分析
# clear variables and close windows
rm(list = ls(all = TRUE))
graphics.off()
?ability.cov
ability.cov
options(digits = 2)

# 利用函数 cov2cor 计算相关矩阵
x = ability.cov$cov # 读入协方差矩阵数据
cor_x = cov2cor(x) # 利用协方差矩阵计算相关矩阵
cor_x

library(psych)
?fa.parallel # fa.parallel 的帮助文件: 数据或相关矩阵的碎石图
graphics.off()
fa.parallel(cor_x, n.obs = 112, fa = "both", n.iter = 20, show.legend = FALSE, 
            main = "Scree plots with parallel analysis")

?fa # fa 函数的帮助文件：因子分析
fa_x = fa(cor_x, nfactors = 2, n.obs = 112, rotate = "none", fm = "pa", scores = "regression" )
fa_x

# 正交旋转
fa_x.varimax = fa(cor_x, nfactors = 2, n.obs = 112, rotate = "varimax", fm = "pa", scores = "regression" )
fa_x.varimax

# 正交旋转效果图
graphics.off()
par(mfrow = c(1, 2))
factor.plot(fa_x.varimax, labels = rownames(fa_x.varimax$loadings))
fa.diagram(fa_x.varimax, simple = TRUE)