Chapter_11.R · 芝士酷

#################### Standardized Linear Combination ####################
rm(list = ls(all = TRUE))
graphics.off()

#### Fig 11.1
par(mfrow=c(3, 1))
set.seed(1963)
x <- rnorm(50)
y = 0.8 * x + rnorm(50, 0, 1/2)
a <- 0.5
plot(x, y, xlim = c(-4, 4), ylim = c(-5, 5), axes = FALSE, 
     xlab = '', ylab = '', cex = 2)
axis(1, at = -4:4)
axis(2, at = -5:5)
abline(0, a, col = 2, lwd = 2)
y.lm <- lm(y ~ x)
b <- coef(y.lm)[2]
abline(0, b, col = 3, lty = 2)
title(main = 'Direction in Data', cex.main = 2)

proj_x <- (x + a * y) / (1 + a^2)
plot(proj_x, rep(0, 50), xlim = c(-4, 4), ylim = c(-1, 1), axes = FALSE, 
     xlab = '', ylab = '', cex = 2)
axis(1, at = -4:4)
axis(2, at = -1:1)
title(main = 'Projection', cex.main = 2)

var(y)
hat_y <- a * x
var(hat_y)
var(hat_y) / var(y)
plot(proj_x, rep(0, 50), xlim = c(-4, 4), ylim = c(-1, 1), axes = FALSE, 
     xlab = '', ylab = '', type = 'n')
text(-2, 0.5, adj = 0, 'Explained Variance: 0.3119', cex = 2)
text(-2, 0, adj = 0, 'Total Variance: 1.1256', cex = 2)
text(-2, -0.5, adj = 0, 'Explained Percentage: 0.2771', cex = 2)

#### Fig 11.2
rm(list = ls(all = TRUE))
graphics.off()
par(mfrow=c(3, 1))
set.seed(1963)
x <- rnorm(50)
y = 0.8 * x + rnorm(50, 0, 1/2)
plot(x, y, xlim = c(-4, 4), ylim = c(-5, 5), axes = FALSE, 
     xlab = '', ylab = '', cex = 2)
axis(1, at = -4:4)
axis(2, at = -5:5)
y.lm <- lm(y ~ x)
summary(y.lm)
abline(y.lm, col=2, lwd=2)
title(main = 'Direction in Data', cex.main = 2)

a <- coef(y.lm)[2]
proj_x <- (x + a * y) / (1 + a^2)
plot(proj_x, rep(0, 50), xlim = c(-4, 4), ylim = c(-1, 1), axes = FALSE, 
     xlab = '', ylab = '', cex = 2)
axis(1, at = -4:4)
axis(2, at = -1:1)
title(main = 'Projection', cex.main = 2)

var(y)
hat_y <- predict(y.lm)
var(hat_y)
var(hat_y) / var(y)
plot(proj_x, rep(0, 50), xlim = c(-4, 4), ylim = c(-1, 1), axes = FALSE, 
     xlab = '', ylab = '', type = 'n')
text(-2, 0.5, adj = 0, 'Explained Variance: 0.8889', cex = 2)
text(-2, 0, adj = 0, 'Total Variance: 1.1256', cex = 2)
text(-2, -0.5, adj = 0, 'Explained Percentage: 0.7898', cex = 2)

#### Example 11.2
rm(list = ls(all = TRUE))
graphics.off()
library(mclust)
data(banknote)
str(banknote)
head(banknote)
X <- as.matrix(banknote[, 2:7])
bar_X <- apply(X, 2, mean)
round(bar_X, digits = 1)
n <- dim(X)[1]
S <- n * var(X) / (n-1)
round(S, digits = 3)
eigen_S <- eigen(S)
eigen_values <- eigen_S$values
round(eigen_values, digits = 3)
eigen_vectors <- as.matrix(eigen_S$vectors)
round(eigen_vectors, digits = 3)

## Fig 11.3
Y <- X %*% eigen_vectors
par(mfrow=c(2, 2))
plot(Y[1:100, 1], Y[1:100, 2], xlim = c(-52, -45), ylim = c(-50, -43), 
     xlab = 'PC1', ylab = 'PC2', col = 'red', cex = 1.5)
points(Y[101:200, 1], Y[101:200, 2], col = 'blue', cex = 1.5, pch = 3)
title(main = 'First vs. Second PC')

plot(Y[1:100, 2], Y[1:100, 3], xlim = c(-50, -43), ylim = c(-242, -238), 
     xlab = 'PC2', ylab = 'PC3', col = 'red', cex = 1.5)
points(Y[101:200, 2], Y[101:200, 3], col = 'blue', cex = 1.5, pch = 3)
title(main = 'Second vs. Third PC')

plot(Y[1:100, 1], Y[1:100, 3], xlim = c(-52, -45), ylim = c(-242, -238), 
     xlab = 'PC1', ylab = 'PC3', col = 'red', cex = 1.5)
points(Y[101:200, 1], Y[101:200, 3], col = 'blue', cex = 1.5, pch = 3)
title(main = 'First vs. Third PC')

plot(1:6, eigen_values, xlab = 'Index', ylab = 'Lambda', main = 'Eigenvalues of S', 
     cex = 1.5, pch = 16)

#### Example 11.3
X_new <- X
X_new[, 1:3] <- X[, 1:3] / 10
X_new[, 6] <- X[, 6] / 10
head(X)
head(X_new)

bar_X_new <- apply(X_new, 2, mean)
round(bar_X_new, digits = 1)
S_new <- n * var(X_new) / (n-1)
round(S_new, digits = 3)
eigen_S_new <- eigen(S_new)
eigen_values_new <- eigen_S_new$values
round(eigen_values_new, digits = 4)
eigen_vectors_new <- as.matrix(eigen_S_new$vectors)
round(eigen_vectors_new, digits = 4)

## Fig 11.4
Y_new <- X_new %*% eigen_vectors_new
par(mfrow=c(2, 2))
plot(Y_new[1:100, 1], Y_new[1:100, 2], xlim = c(7, 14), ylim = c(-11, -5), 
     xlab = 'PC1', ylab = 'PC2', col = 'red', cex = 1.5)
points(Y_new[101:200, 1], Y_new[101:200, 2], col = 'blue', cex = 1.5, pch = 3)
title(main = 'First vs. Second PC')

plot(Y_new[1:100, 2], Y_new[1:100, 3], xlim = c(-11, -5), ylim = c(-14.3, -13.8), 
     xlab = 'PC2', ylab = 'PC3', col = 'red', cex = 1.5)
points(Y_new[101:200, 2], Y_new[101:200, 3], col = 'blue', cex = 1.5, pch = 3)
title(main = 'Second vs. Third PC')

plot(Y_new[1:100, 1], Y_new[1:100, 3], xlim = c(7, 14), ylim = c(-14.3, -13.8), 
     xlab = 'PC1', ylab = 'PC3', col = 'red', cex = 1.5)
points(Y_new[101:200, 1], Y_new[101:200, 3], col = 'blue', cex = 1.5, pch = 3)
title(main = 'First vs. Third PC')

plot(1:6, eigen_values_new, xlab = 'Index', ylab = 'Lambda', 
     main = 'Eigenvalues of S', cex = 1.5, pch = 16)

#################### Interpretation of the PCs ####################
Centered_X <- scale(X, center = TRUE, scale = FALSE)
S_centered <- n * var(Centered_X) / (n-1)
eigen_S_centered <- eigen(S_centered)
lambda <- eigen_S_centered$values
round(lambda, digits = 3)
G_centered <- eigen_S_centered$vectors
round(G_centered, digits = 3)

round(lambda, digits = 3)
round(cumsum(lambda) / sum(lambda), digits = 3) # cumulative proportions

#### scree plot
graphics.off()
m <- dim(X)[2]
plot(1:m, lambda, xlab = 'Index', ylab = 'Lambda', 
     main = 'The scree plot', cex = 2, pch = 16)

mp <- lambda / sum(lambda)
plot(1:m, mp, xlab = 'Index', ylab = 'Variance Explained', main = 'Swiss Bank Notes', 
     cex = 2, pch = 16, ylim = c(0, 0.8))

#### The correlation of the original variable with the PCs
rm(list = ls(all = TRUE))
graphics.off()

# 载入数据
library(mclust)
data(banknote)
x  = banknote[, 2:7]
n  = nrow(x)

# 谱分解
e  = eigen(cov(x) * (n-1) / n)  # 协方差矩阵的谱分解
round(e$values, digits = 3) # 特征值
e1 = e$values / sum(e$values) # 各特征值的占比
round(e1, digits = 3)
e2 = cumsum(e$values) / sum(e$values) # 各特征值的累积占比
round(e2, digits = 3)
round(e$vectors, digits = 3) # 特征向量

m    = apply(as.matrix(x), 2, mean) # 样本均值向量
m
temp = as.matrix(x - matrix(m, n, ncol(x), byrow = T)) # 中心化数据
head(temp)
r0    = temp %*% e$vectors # 主成分得分值
round(head(r0), digits = 3)

# 观测数据在第一、第二、第三主成分得分值的散点图，碎石图
graphics.off()
par(mfrow = c(2, 2))
plot(r0[1:100, 1], r0[1:100, 2], xlim = c(-3, 4), ylim = c(-4, 4), 
     xlab = 'PC1', ylab = 'PC2', col = 'red', cex = 1.5)
points(r0[101:200, 1], r0[101:200, 2], col = 'blue', cex = 1.5, pch = 3)
title(main = 'First vs. Second PC')

plot(r0[1:100, 2], r0[1:100, 3], xlim = c(-4, 4), ylim = c(-2, 2), 
     xlab = 'PC2', ylab = 'PC3', col = 'red', cex = 1.5)
points(r0[101:200, 2], r0[101:200, 3], col = 'blue', cex = 1.5, pch = 3)
title(main = 'Second vs. Third PC')

plot(r0[1:100, 1], r0[1:100, 3], xlim = c(-3, 4), ylim = c(-2, 2), 
     xlab = 'PC1', ylab = 'PC3', col = 'red', cex = 1.5)
points(r0[101:200, 1], r0[101:200, 3], col = 'blue', cex = 1.5, pch = 3)
title(main = 'First vs. Third PC')

plot(1:6, e$values, xlab = 'Index', ylab = 'Lambda', 
     main = 'Eigenvalues of S', cex = 1.5, pch = 16)

# 主成分与初始变量间的相关系数
r    = cor(cbind(r0, x)) # 主成分与初始变量间的相关系数
round(r, digits = 3)

r1   = r[7:12, 1:2]      # 初始变量与前两个主成分的相关系数
round(r1, digits = 3)

# 初始变量与前两个主成分的相关系数的散点图
graphics.off()
ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi))
plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "First PC", 
     ylab = "Second PC", 
     main = "Swiss Bank Notes", cex.lab = 1.2, cex.axis = 1.2, 
     cex.main = 1.8, lwd = 2, asp = 1)
abline(h = 0, v = 0)
label = c("X1", "X2", "X3", "X4", "X5", "X6")
text(r1, label, cex = 1.2)

# 前两个主成分对各个初始变量的解释比例
srs = array(0, dim = 6)
for (i in 1:6) srs[i] = r1[i, 1]^2 + r1[i, 2]^2
round(cbind(r1, ss = srs), digits = 3)

#################### Principal Components as a Factorial Method ####################
rm(list = ls(all = TRUE)) # clear all variables
graphics.off()

#### French food expenditure data set 法国食物消费数据集
setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data")
library(readr)
x = read_csv("food.csv") # 读入数据
x
p  = ncol(x)
n  = nrow(x)
x  = x[, 2:p] # 剔除第一列 family
x

x1 = sqrt((n - 1) * apply(x, 2, var)/n) # 计算各变量的标准差
x1

x2 = x - matrix(apply(as.matrix(x), 2, mean), 
                nrow = n, 
                ncol = p - 1, 
                byrow = T) # 中心化变换
x2

x  = as.matrix(x2 / matrix(x1, 
                           nrow = n, 
                           ncol = p - 1, byrow = T)) # 标准化
round(x, digits = 3)

R  = t(x) %*% x / n # 计算相关矩阵
round(R, digits = 3)

e  = eigen(R) # 谱分解
e1 = e$values # 特征值
round(e1, digits = 3)

prop = e1 / sum(e1) # 规范化主成分的贡献率
round(prop, digits = 4)

cum_prop = cumsum(e1) / sum(e1) # 规范化主成分的累积贡献率
round(cum_prop, digits = 4)
# 前两个主成分的累积贡献率达到0.88以上，取前两个 NPC 即可

e2 = e$vectors # 特征向量
round(e2, digits = 3)

# 计算初始变量与前两个 NPC 的相关系数
a  = e2[, 1:2] # 前两个特征向量
w  = a * sqrt(matrix(e1[1:2], 
                     nrow(a), 
                     ncol(a), 
                     byrow = TRUE))
round(w, digits = 3)

# 各初始变量与前两个 NPC 相关系数的平方和
round(as.matrix(apply(w^2, 1, sum)), digits = 3)

#### 变量的可视化
z  = a * sqrt(matrix(e1[1:2], nrow(a), ncol(a), byrow = TRUE))

graphics.off()
ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi))
plot(ucircle, type = "l", lty = "solid", 
     col = "blue", xlim = c(-1.05, 1.05), 
     ylim = c(-1.05, 1.05), 
     xlab = "First Factor - Goods", 
     ylab = "Second Factor - Goods", 
     main = "French Food data", 
     cex.lab = 1.2, cex.axis = 1.2, 
     cex.main = 1.8, lwd = 2, asp = 1)
abline(h = 0, v = 0)
label = c("bread", "vegetables", "fruits", 
          "meat", "poultry", "milk", "wine")
text(z, label)

#### 观测数据的可视化
graphics.off()
e  = eigen(x %*% t(x) / n)
e1 = e$values
e2 = e$vectors
a  = e2[, 1:2]
w  = -a * sqrt(matrix(e1[1:2], nrow(a), ncol(a), byrow = TRUE))

plot(w, type = "n", xlab = "First Factor - Families", ylab = "Second Factor - Families", 
     main = "French Food data", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.8, lwd = 2)
text(w, c("MA2", "EM2", "CA2", "MA3", "EM3", "CA3", "MA4", "EM4", "CA4", "MA5", "EM5", 
          "CA5"), cex = 1.2)
abline(h = 0, v = 0)
segments(w[1, 1], w[1, 2], w[2, 1], w[2, 2], lty = 3, lwd = 2)
segments(w[2, 1], w[2, 2], w[3, 1], w[3, 2], lty = 3, lwd = 2)
segments(w[1, 1], w[1, 2], w[4, 1], w[4, 2], lty = 3, lwd = 2, col = 'red')
segments(w[2, 1], w[2, 2], w[5, 1], w[5, 2], lty = 3, lwd = 2, col = 'green3')
segments(w[4, 1], w[4, 2], w[5, 1], w[5, 2], lty = 3, lwd = 2)
segments(w[5, 1], w[5, 2], w[6, 1], w[6, 2], lty = 3, lwd = 2)
segments(w[3, 1], w[3, 2], w[6, 1], w[6, 2], lty = 3, lwd = 2, col = 'orange')
segments(w[6, 1], w[6, 2], w[9, 1], w[9, 2], lty = 3, lwd = 2, col = 'orange')
segments(w[8, 1], w[8, 2], w[9, 1], w[9, 2], lty = 3, lwd = 2)
segments(w[5, 1], w[5, 2], w[8, 1], w[8, 2], lty = 3, lwd = 2, col = 'green3')
segments(w[7, 1], w[7, 2], w[8, 1], w[8, 2], lty = 3, lwd = 2)
segments(w[4, 1], w[4, 2], w[7, 1], w[7, 2], lty = 3, lwd = 2, col = 'red')
segments(w[7, 1], w[7, 2], w[10, 1], w[10, 2], lty = 3, lwd = 2, col = 'red')
segments(w[8, 1], w[8, 2], w[11, 1], w[11, 2], lty = 3, lwd = 2, col = 'green3')
segments(w[9, 1], w[9, 2], w[12, 1], w[12, 2], lty = 3, lwd = 2, col = 'orange')
segments(w[10, 1], w[10, 2], w[11, 1], w[11, 2], lty = 3, lwd = 2)
segments(w[11, 1], w[11, 2], w[12, 1], w[12, 2], lty = 3, lwd = 2)


######################################## NPCA of Boston Housing Data Set 波士顿房价数据集的主成分分析 ###################
rm(list = ls(all = TRUE)) # clear all variables
graphics.off()
library(MASS)
data = Boston
head(data)

# transform data
xt       = data
xt[, 1]  = log(data[, 1])
xt[, 2]  = data[, 2]/10
xt[, 3]  = log(data[, 3])
xt[, 5]  = log(data[, 5])
xt[, 6]  = log(data[, 6])
xt[, 7]  = (data[, 7]^(2.5))/10000
xt[, 8]  = log(data[, 8])
xt[, 9]  = log(data[, 9])
xt[, 10] = log(data[, 10])
xt[, 11] = exp(0.4 * data[, 11])/1000
xt[, 12] = data[, 12]/100
xt[, 13] = sqrt(data[, 13])
xt[, 14] = log(data[, 14])
data     = xt[, -4]
head(data)

n1 = nrow(data)
n2 = ncol(data)
# standardizes the data 数据标准化
x = (data - matrix(apply(data, 2, mean), n1, n2, byrow = T)) / matrix(sqrt((n1 - 1) * 
                    apply(data, 2, var)/n1), n1, n2, byrow = T)
round(head(x), digits = 4)

# spectral decomposition 相关矩阵的谱分解
eig = eigen((n1 - 1) * cov(x)/n1)
e = eig$values # eigenvalues 特征值
perc = e/sum(e) # explained variance 各主成分的贡献率
cum = cumsum(e)/sum(e) # cumulative explained percentages 累积贡献率
evpc = data.frame(eigenvalues = e, percent = perc, cumpercent = cum)
round(evpc, digits = 4)

v = eig$vectors
round(v, digits = 4) # eigenvectors 特征向量

# principal components 主成分的取值，即数据在各主成分的投影值
xv   = as.matrix(x) %*% v
round(head(xv), digits = 4)

# correlations of the first 3 PC's with the original variables 原始变量与前三个主成分的相关系数
corr = cor(x, xv)[, 1:3]
colnames(corr) = c('PC1', 'PC2', 'PC3')
round(corr, digits = 4)
# proportions of variables explained by first 3 PCs 各变量由前三个主成分解释的比例
cumr2 = apply(corr^2, 1, sum)
round(as.matrix(cumr2), digits = 4)


graphics.off()
par(mfrow = c(2, 2))
# 变量在 PC1、PC2 方向的散点图
ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi))
plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "First PC", ylab = "Second PC",
     main = "Boston Housing", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6, lwd = 2, asp = 1)
abline(h = 0, v = 0)
label = c("X1", "X2", "X3", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")
text(corr[, 1], corr[, 2], label)

# 变量在 PC2、PC3 方向的散点图
ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi))
plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "Third PC", ylab = "Second PC", 
     main = "Boston Housing", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6, lwd = 2, asp = 1)
abline(h = 0, v = 0)
label = c("X1", "X2", "X3", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")
text(corr[, 3], corr[, 2], label)

# 变量在 PC1、PC3 方向的散点图
ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi))
plot(ucircle, type = "l", lty = "solid", col = "blue", 
     xlab = "First PC", ylab = "Third PC", 
     main = "Boston Housing", cex.lab = 1.2, cex.axis = 1.2, 
     cex.main = 1.6, lwd = 2, asp = 1)
abline(h = 0, v = 0)
label = c("X1", "X2", "X3", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")
text(corr[, 1], corr[, 3], label)

# 碎石图
plot(perc, xlab = 'PCs', ylab = 'Proportion of total variances explained by PCs', 
     main = 'Scree Plot', pch = 16,
     cex = 2, axes = FALSE, ylim = c(0, 0.6))
axis(1, at = 1:13)
axis(2, at = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6))
abline(h=0, lty=2)

# price x[, 13] greater or less than average, 15 is >= average, 17 is < average, stored in h1
h = x[, 13]
h[h >= mean(h)] = 15
h1   = h
h1[h1 < mean(x[, 13])] = 17

hr = h1
# mark more expensive houses with red square
hr[hr == 15] = "red"
# mark cheaper houses with black triangle
hr[hr == 17] = "black"                           
# correction for signs of eigenvectors
xv = xv * (-1)

# 观测值投影在前两个主成分上的散点图：红色矩形框代表高于平均房价的观测值，黑色三角形框代表低于平均房价的观测值
graphics.off()
plot(xv[, 1], xv[, 2], pch = h1, col = hr, xlab = "PC1", 
     ylab = "PC2", main = "First vs. Second PC", 
     cex.axis = 1.2, cex.lab = 1.2, cex.main = 1.6)
abline(h=0, v=0, lty=2, col = 'cyan2')

# houses close to the Charles River are indicated with red squares 
col = xt[, 4]
col[col == 1] = "red"
col[col == 0] = "black"

h2 = xt[, 4]
h2[h2 == 1] = 15
h2[h2 == 0] = 17
# 观测值投影在前两个主成分上的散点图：红色矩形框代表位于 Charles 河边的观测值，黑色三角形框代表远离 Charles 河的观测值
graphics.off()
plot(xv[, 1], xv[, 2], pch = h2, col = col, xlab = "PC1", ylab = "PC2", 
     main = "First vs. Second PC", 
     cex.axis = 1.2, cex.lab = 1.2, cex.main = 1.6)
abline(h=0, v=0, lty=2, col = 'cyan2')

######################################## NPCA of 79 US Companies 美国79家公司的标准化主成分分析 ####################
rm(list = ls(all = TRUE)) # clear all variables
graphics.off()
setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data")
library(readr)
x = read_table("uscomp2.txt", col_name = FALSE)
colnames(x) = c("Company", "A", "S", "MV", "P", "CF", "E", "Sector")
attach(x)
Sector = as.character(Sector)
Sector[1:2]   = "H"
Sector[3:17]  = "E"
Sector[18:34] = "F"
Sector[35:42] = "H"
Sector[43:52] = "M"
Sector[53:63] = "*"
Sector[64:73] = "R"
Sector[74:79] = "*"
detach(x)
head(x)

# standardizes the data 数据标准化
x = as.matrix(x[, 2:7])
n = nrow(x)  # number of rows
x = (x - matrix(apply(x, 2, mean), n, 6, byrow = T)) / 
  matrix(sqrt((n - 1) * apply(x, 2, var)/n), n, 6, byrow = T)
round(head(x), digits = 4)

# spectral decomposition 相关矩阵的谱分解
eig = eigen((n - 1) * cov(x)/n)
e = eig$values
round(as.matrix(e), digits = 3)
v = eig$vectors
round(v, digits = 3)

# principal components 主成分取值
x = as.matrix(x) %*% v
graphics.off()
plot(cbind(-x[, 1], -x[, 2]), type = "n", xlab = "PC 1", ylab = "PC 2", main = "First vs. Second PC", 
     cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6)
text(cbind(-x[, 1], -x[, 2]), Sector)

# scree plot 碎石图
plot(e, xlab = "Index", ylab = "Lambda", main = "Eigenvalues of S", cex.lab = 1.2, 
     cex.axis = 1.2, cex.main = 1.6, pch = 16, cex = 1.5)
abline(h = 0, lty = 2)

#### NPCA of 79 US Companies without IBM and GE 不含 IBM 和 GE 的标准化主成分分析
rm(list = ls(all = TRUE)) # clear all variables
graphics.off()
setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data")
library(readr)
x = read_table("uscomp2.txt", col_name = FALSE)
x = rbind(x[1:37, ], x[39, ], x[41:79, ])
colnames(x) = c("Company", "A", "S", "MV", "P", "CF", "E", "Sector")
attach(x)
Sector = as.character(Sector)
Sector[1:2]   = "H"
Sector[3:17]  = "E"
Sector[18:34] = "F"
Sector[35:40] = "H"
Sector[41:50] = "M"
Sector[51:61] = "*"
Sector[62:71] = "R"
Sector[72:77] = "*"
detach(x)

x = x[, 2:7]
n = nrow(x)
# standardizes the data 数据标准化
x = (x - matrix(apply(x, 2, mean), n, 6, byrow = T)) / 
  matrix(sqrt((n - 1) * apply(x, 2, var)/n), n, 6, byrow = T)
# spectral decomposition 谱分解
eig = eigen((n - 1) * cov(x)/n)
e = eig$values
round(as.matrix(e), digits = 3)
v = eig$vectors
round(v, digits = 3)

# principal components 主成分值
x = as.matrix(x) %*% v
plot(-x[, 1], x[, 2], xlim = c(-2, 8), ylim = c(-8, 8), type = "n", 
     xlab = "PC 1", ylab = "PC 2", 
     main = "First vs. Second PC")
text(-x[, 1], x[, 2], Sector)
abline(h = 0, v = 0, lty = 2, col = 'orange')

# scree plot 碎石图
plot(e, xlab = "Index", ylab = "Lambda", main = "Eigenvalues of S", 
     pch = 16, cex = 1.5)
abline(h = 0, lty = 2, col = 'orange')

#### Plot for the relative proportion of variance explained by PCs
rm(list = ls(all = TRUE)) # clear all variables
graphics.off()
setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data")
library(readr)
x = read_table("uscomp2.txt", col_name = FALSE)
x = rbind(x[1:37, ], x[39, ], x[41:79, ])
x  = x[, 2:7]
n = nrow(x)
x = scale(x) # standardizes the data matrix
e = eigen((n - 1) * cov(x)/n) # calculates eigenvalues and eigenvectors
e1 = e$values
evprop = data.frame(eigenvalues = e1, proportion = e1/sum(e1), cumprop = cumsum(e1)/sum(e1))
round(evprop, digits = 3)

r = x %*% e$vectors # principal components
r = cor(cbind(r, x)) # correlation between PCs and variables
r1 = r[7:12, 1:2] # correlation of the two most important PCs and variables
corpcx = data.frame(PC1 = r1[, 1], PC2 = r1[, 2], SquaredSum = r1[, 1]^2 + r1[, 2]^2)
rownames(corpcx) = c('X1', 'X2', 'X3', 'X4', 'X5', 'X6')
round(corpcx, digits = 2)

ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi))
plot(ucircle, type = "l", lty = "solid", col = "blue", 
     xlab = "First PC", ylab = "Second PC", 
     main = "U.S. Company Data", cex.lab = 1.2, cex.axis = 1.2, 
     cex.main = 1.8, lwd = 2, asp = 1)
abline(h = 0, v = 0)
label = c("X1", "X2", "X3", "X4", "X5", "X6")
text(-r1[, 1], r1[, 2], label, cex = 1.2)

#################### NPCA of USJudgeRatings ##################################################################
rm(list = ls(all = TRUE)) # clear all variables
graphics.off()
library(psych)
x = USJudgeRatings
head(x)
?USJudgeRatings
r = cor(x) # 计算相关矩阵
round(r, digits = 3)

NPCA.x = princomp(x, cor = TRUE, scores = TRUE, fix_sign = TRUE) # 标准化主成分分析

str(NPCA.x) # princomp 结果中的内容

round(NPCA.x$sdev, digits = 3) # 主成分标准差（方差的平方根）

round(NPCA.x$loadings, digits = 3) # 主成分载荷

round(NPCA.x$loadings[,1], digits = 3) # 第一主成分在原始变量的权重

round(NPCA.x$loadings[,2], digits = 3) # 第二主成分在原始变量的权重

screeplot(NPCA.x, main = 'Scree Plot of NPCA for USJudgeRatings') # 碎石图

round(NPCA.x$center, digits = 3) # 各个变量的均值（中心）

round(NPCA.x$scale, digits = 3) # 作用于各个变量的尺度

NPCA.x$n.obs # 观测值的个数

round(NPCA.x$scores, digits = 3) # 观测值在各主成分的投影值

scores_x = cbind(NPCA.x$scores, x) # 主成分值与原始数据值的合并数据矩阵
cor.NPC.x = cor(scores_x) # 主成分与原始变量的相关系数
cor.NPC.x.2 = cor.NPC.x[13:24, 1:2] # 原始变量与前两个主成分的相关系数
round(cor.NPC.x.2, digits = 3)

# 变量在前两个主方向的散点图
graphics.off()
ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi))
plot(ucircle, type = "l", lty = "solid", col = "blue", 
     xlab = "First PC", ylab = "Second PC", 
     main = "USJusgeRatings", cex.lab = 1.2, cex.axis = 1.2, 
     cex.main = 1.8, lwd = 2, asp = 1)
abline(h = 0, v = 0)
label = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12")
# label = row.names(cor.NPC.x.2)
text(-cor.NPC.x.2[, 1], cor.NPC.x.2[, 2], label, cex = 1.0)

# 观测值在前两个主成分的散点图
graphics.off()
plot(cbind(NPCA.x$scores[, 1], NPCA.x$scores[, 2]), type = "p", 
     xlab = "PC 1", ylab = "PC 2", pch = 16,
     main = "First NPC vs. Second NPC", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6)
abline(h = 0, v = 0, lty = 2)

graphics.off()
plot(cbind(NPCA.x$scores[, 1], NPCA.x$scores[, 2]), type = "n", xlab = "PC 1", ylab = "PC 2",
     main = "First NPC vs. Second NPC", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6)
abline(h = 0, v = 0, lty = 2)
label = 1:NPCA.x$n.obs
text(NPCA.x$scores[, 1], NPCA.x$scores[, 2], label)

# 前两个主成分得分按照方差的加权平均，对所有观测值(法官)排序
judge = (NPCA.x$sdev[1]^2 * NPCA.x$scores[, 1] + NPCA.x$sdev[2]^2 * NPCA.x$scores[, 2]) / 
  (NPCA.x$sdev[1]^2 + NPCA.x$sdev[2]^2)
rank = rank(judge) # 综合排序
rank
sort.judge = cbind(NPCA.x$scores[, 1], NPCA.x$scores[, 2], judge, rank)
sort.judge