#################### Standardized Linear Combination #################### rm(list = ls(all = TRUE)) graphics.off() #### Fig 11.1 par(mfrow=c(3, 1)) set.seed(1963) x <- rnorm(50) y = 0.8 * x + rnorm(50, 0, 1/2) a <- 0.5 plot(x, y, xlim = c(-4, 4), ylim = c(-5, 5), axes = FALSE, xlab = '', ylab = '', cex = 2) axis(1, at = -4:4) axis(2, at = -5:5) abline(0, a, col = 2, lwd = 2) y.lm <- lm(y ~ x) b <- coef(y.lm)[2] abline(0, b, col = 3, lty = 2) title(main = 'Direction in Data', cex.main = 2) proj_x <- (x + a * y) / (1 + a^2) plot(proj_x, rep(0, 50), xlim = c(-4, 4), ylim = c(-1, 1), axes = FALSE, xlab = '', ylab = '', cex = 2) axis(1, at = -4:4) axis(2, at = -1:1) title(main = 'Projection', cex.main = 2) var(y) hat_y <- a * x var(hat_y) var(hat_y) / var(y) plot(proj_x, rep(0, 50), xlim = c(-4, 4), ylim = c(-1, 1), axes = FALSE, xlab = '', ylab = '', type = 'n') text(-2, 0.5, adj = 0, 'Explained Variance: 0.3119', cex = 2) text(-2, 0, adj = 0, 'Total Variance: 1.1256', cex = 2) text(-2, -0.5, adj = 0, 'Explained Percentage: 0.2771', cex = 2) #### Fig 11.2 rm(list = ls(all = TRUE)) graphics.off() par(mfrow=c(3, 1)) set.seed(1963) x <- rnorm(50) y = 0.8 * x + rnorm(50, 0, 1/2) plot(x, y, xlim = c(-4, 4), ylim = c(-5, 5), axes = FALSE, xlab = '', ylab = '', cex = 2) axis(1, at = -4:4) axis(2, at = -5:5) y.lm <- lm(y ~ x) summary(y.lm) abline(y.lm, col=2, lwd=2) title(main = 'Direction in Data', cex.main = 2) a <- coef(y.lm)[2] proj_x <- (x + a * y) / (1 + a^2) plot(proj_x, rep(0, 50), xlim = c(-4, 4), ylim = c(-1, 1), axes = FALSE, xlab = '', ylab = '', cex = 2) axis(1, at = -4:4) axis(2, at = -1:1) title(main = 'Projection', cex.main = 2) var(y) hat_y <- predict(y.lm) var(hat_y) var(hat_y) / var(y) plot(proj_x, rep(0, 50), xlim = c(-4, 4), ylim = c(-1, 1), axes = FALSE, xlab = '', ylab = '', type = 'n') text(-2, 0.5, adj = 0, 'Explained Variance: 0.8889', cex = 2) text(-2, 0, adj = 0, 'Total Variance: 1.1256', cex = 2) text(-2, -0.5, adj = 0, 'Explained Percentage: 0.7898', cex = 2) #### Example 11.2 rm(list = ls(all = TRUE)) graphics.off() library(mclust) data(banknote) str(banknote) head(banknote) X <- as.matrix(banknote[, 2:7]) bar_X <- apply(X, 2, mean) round(bar_X, digits = 1) n <- dim(X)[1] S <- n * var(X) / (n-1) round(S, digits = 3) eigen_S <- eigen(S) eigen_values <- eigen_S$values round(eigen_values, digits = 3) eigen_vectors <- as.matrix(eigen_S$vectors) round(eigen_vectors, digits = 3) ## Fig 11.3 Y <- X %*% eigen_vectors par(mfrow=c(2, 2)) plot(Y[1:100, 1], Y[1:100, 2], xlim = c(-52, -45), ylim = c(-50, -43), xlab = 'PC1', ylab = 'PC2', col = 'red', cex = 1.5) points(Y[101:200, 1], Y[101:200, 2], col = 'blue', cex = 1.5, pch = 3) title(main = 'First vs. Second PC') plot(Y[1:100, 2], Y[1:100, 3], xlim = c(-50, -43), ylim = c(-242, -238), xlab = 'PC2', ylab = 'PC3', col = 'red', cex = 1.5) points(Y[101:200, 2], Y[101:200, 3], col = 'blue', cex = 1.5, pch = 3) title(main = 'Second vs. Third PC') plot(Y[1:100, 1], Y[1:100, 3], xlim = c(-52, -45), ylim = c(-242, -238), xlab = 'PC1', ylab = 'PC3', col = 'red', cex = 1.5) points(Y[101:200, 1], Y[101:200, 3], col = 'blue', cex = 1.5, pch = 3) title(main = 'First vs. Third PC') plot(1:6, eigen_values, xlab = 'Index', ylab = 'Lambda', main = 'Eigenvalues of S', cex = 1.5, pch = 16) #### Example 11.3 X_new <- X X_new[, 1:3] <- X[, 1:3] / 10 X_new[, 6] <- X[, 6] / 10 head(X) head(X_new) bar_X_new <- apply(X_new, 2, mean) round(bar_X_new, digits = 1) S_new <- n * var(X_new) / (n-1) round(S_new, digits = 3) eigen_S_new <- eigen(S_new) eigen_values_new <- eigen_S_new$values round(eigen_values_new, digits = 4) eigen_vectors_new <- as.matrix(eigen_S_new$vectors) round(eigen_vectors_new, digits = 4) ## Fig 11.4 Y_new <- X_new %*% eigen_vectors_new par(mfrow=c(2, 2)) plot(Y_new[1:100, 1], Y_new[1:100, 2], xlim = c(7, 14), ylim = c(-11, -5), xlab = 'PC1', ylab = 'PC2', col = 'red', cex = 1.5) points(Y_new[101:200, 1], Y_new[101:200, 2], col = 'blue', cex = 1.5, pch = 3) title(main = 'First vs. Second PC') plot(Y_new[1:100, 2], Y_new[1:100, 3], xlim = c(-11, -5), ylim = c(-14.3, -13.8), xlab = 'PC2', ylab = 'PC3', col = 'red', cex = 1.5) points(Y_new[101:200, 2], Y_new[101:200, 3], col = 'blue', cex = 1.5, pch = 3) title(main = 'Second vs. Third PC') plot(Y_new[1:100, 1], Y_new[1:100, 3], xlim = c(7, 14), ylim = c(-14.3, -13.8), xlab = 'PC1', ylab = 'PC3', col = 'red', cex = 1.5) points(Y_new[101:200, 1], Y_new[101:200, 3], col = 'blue', cex = 1.5, pch = 3) title(main = 'First vs. Third PC') plot(1:6, eigen_values_new, xlab = 'Index', ylab = 'Lambda', main = 'Eigenvalues of S', cex = 1.5, pch = 16) #################### Interpretation of the PCs #################### Centered_X <- scale(X, center = TRUE, scale = FALSE) S_centered <- n * var(Centered_X) / (n-1) eigen_S_centered <- eigen(S_centered) lambda <- eigen_S_centered$values round(lambda, digits = 3) G_centered <- eigen_S_centered$vectors round(G_centered, digits = 3) round(lambda, digits = 3) round(cumsum(lambda) / sum(lambda), digits = 3) # cumulative proportions #### scree plot graphics.off() m <- dim(X)[2] plot(1:m, lambda, xlab = 'Index', ylab = 'Lambda', main = 'The scree plot', cex = 2, pch = 16) mp <- lambda / sum(lambda) plot(1:m, mp, xlab = 'Index', ylab = 'Variance Explained', main = 'Swiss Bank Notes', cex = 2, pch = 16, ylim = c(0, 0.8)) #### The correlation of the original variable with the PCs rm(list = ls(all = TRUE)) graphics.off() # 载入数据 library(mclust) data(banknote) x = banknote[, 2:7] n = nrow(x) # 谱分解 e = eigen(cov(x) * (n-1) / n) # 协方差矩阵的谱分解 round(e$values, digits = 3) # 特征值 e1 = e$values / sum(e$values) # 各特征值的占比 round(e1, digits = 3) e2 = cumsum(e$values) / sum(e$values) # 各特征值的累积占比 round(e2, digits = 3) round(e$vectors, digits = 3) # 特征向量 m = apply(as.matrix(x), 2, mean) # 样本均值向量 m temp = as.matrix(x - matrix(m, n, ncol(x), byrow = T)) # 中心化数据 head(temp) r0 = temp %*% e$vectors # 主成分得分值 round(head(r0), digits = 3) # 观测数据在第一、第二、第三主成分得分值的散点图,碎石图 graphics.off() par(mfrow = c(2, 2)) plot(r0[1:100, 1], r0[1:100, 2], xlim = c(-3, 4), ylim = c(-4, 4), xlab = 'PC1', ylab = 'PC2', col = 'red', cex = 1.5) points(r0[101:200, 1], r0[101:200, 2], col = 'blue', cex = 1.5, pch = 3) title(main = 'First vs. Second PC') plot(r0[1:100, 2], r0[1:100, 3], xlim = c(-4, 4), ylim = c(-2, 2), xlab = 'PC2', ylab = 'PC3', col = 'red', cex = 1.5) points(r0[101:200, 2], r0[101:200, 3], col = 'blue', cex = 1.5, pch = 3) title(main = 'Second vs. Third PC') plot(r0[1:100, 1], r0[1:100, 3], xlim = c(-3, 4), ylim = c(-2, 2), xlab = 'PC1', ylab = 'PC3', col = 'red', cex = 1.5) points(r0[101:200, 1], r0[101:200, 3], col = 'blue', cex = 1.5, pch = 3) title(main = 'First vs. Third PC') plot(1:6, e$values, xlab = 'Index', ylab = 'Lambda', main = 'Eigenvalues of S', cex = 1.5, pch = 16) # 主成分与初始变量间的相关系数 r = cor(cbind(r0, x)) # 主成分与初始变量间的相关系数 round(r, digits = 3) r1 = r[7:12, 1:2] # 初始变量与前两个主成分的相关系数 round(r1, digits = 3) # 初始变量与前两个主成分的相关系数的散点图 graphics.off() ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi)) plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "First PC", ylab = "Second PC", main = "Swiss Bank Notes", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.8, lwd = 2, asp = 1) abline(h = 0, v = 0) label = c("X1", "X2", "X3", "X4", "X5", "X6") text(r1, label, cex = 1.2) # 前两个主成分对各个初始变量的解释比例 srs = array(0, dim = 6) for (i in 1:6) srs[i] = r1[i, 1]^2 + r1[i, 2]^2 round(cbind(r1, ss = srs), digits = 3) #################### Principal Components as a Factorial Method #################### rm(list = ls(all = TRUE)) # clear all variables graphics.off() #### French food expenditure data set 法国食物消费数据集 setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data") library(readr) x = read_csv("food.csv") # 读入数据 x p = ncol(x) n = nrow(x) x = x[, 2:p] # 剔除第一列 family x x1 = sqrt((n - 1) * apply(x, 2, var)/n) # 计算各变量的标准差 x1 x2 = x - matrix(apply(as.matrix(x), 2, mean), nrow = n, ncol = p - 1, byrow = T) # 中心化变换 x2 x = as.matrix(x2 / matrix(x1, nrow = n, ncol = p - 1, byrow = T)) # 标准化 round(x, digits = 3) R = t(x) %*% x / n # 计算相关矩阵 round(R, digits = 3) e = eigen(R) # 谱分解 e1 = e$values # 特征值 round(e1, digits = 3) prop = e1 / sum(e1) # 规范化主成分的贡献率 round(prop, digits = 4) cum_prop = cumsum(e1) / sum(e1) # 规范化主成分的累积贡献率 round(cum_prop, digits = 4) # 前两个主成分的累积贡献率达到0.88以上,取前两个 NPC 即可 e2 = e$vectors # 特征向量 round(e2, digits = 3) # 计算初始变量与前两个 NPC 的相关系数 a = e2[, 1:2] # 前两个特征向量 w = a * sqrt(matrix(e1[1:2], nrow(a), ncol(a), byrow = TRUE)) round(w, digits = 3) # 各初始变量与前两个 NPC 相关系数的平方和 round(as.matrix(apply(w^2, 1, sum)), digits = 3) #### 变量的可视化 z = a * sqrt(matrix(e1[1:2], nrow(a), ncol(a), byrow = TRUE)) graphics.off() ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi)) plot(ucircle, type = "l", lty = "solid", col = "blue", xlim = c(-1.05, 1.05), ylim = c(-1.05, 1.05), xlab = "First Factor - Goods", ylab = "Second Factor - Goods", main = "French Food data", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.8, lwd = 2, asp = 1) abline(h = 0, v = 0) label = c("bread", "vegetables", "fruits", "meat", "poultry", "milk", "wine") text(z, label) #### 观测数据的可视化 graphics.off() e = eigen(x %*% t(x) / n) e1 = e$values e2 = e$vectors a = e2[, 1:2] w = -a * sqrt(matrix(e1[1:2], nrow(a), ncol(a), byrow = TRUE)) plot(w, type = "n", xlab = "First Factor - Families", ylab = "Second Factor - Families", main = "French Food data", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.8, lwd = 2) text(w, c("MA2", "EM2", "CA2", "MA3", "EM3", "CA3", "MA4", "EM4", "CA4", "MA5", "EM5", "CA5"), cex = 1.2) abline(h = 0, v = 0) segments(w[1, 1], w[1, 2], w[2, 1], w[2, 2], lty = 3, lwd = 2) segments(w[2, 1], w[2, 2], w[3, 1], w[3, 2], lty = 3, lwd = 2) segments(w[1, 1], w[1, 2], w[4, 1], w[4, 2], lty = 3, lwd = 2, col = 'red') segments(w[2, 1], w[2, 2], w[5, 1], w[5, 2], lty = 3, lwd = 2, col = 'green3') segments(w[4, 1], w[4, 2], w[5, 1], w[5, 2], lty = 3, lwd = 2) segments(w[5, 1], w[5, 2], w[6, 1], w[6, 2], lty = 3, lwd = 2) segments(w[3, 1], w[3, 2], w[6, 1], w[6, 2], lty = 3, lwd = 2, col = 'orange') segments(w[6, 1], w[6, 2], w[9, 1], w[9, 2], lty = 3, lwd = 2, col = 'orange') segments(w[8, 1], w[8, 2], w[9, 1], w[9, 2], lty = 3, lwd = 2) segments(w[5, 1], w[5, 2], w[8, 1], w[8, 2], lty = 3, lwd = 2, col = 'green3') segments(w[7, 1], w[7, 2], w[8, 1], w[8, 2], lty = 3, lwd = 2) segments(w[4, 1], w[4, 2], w[7, 1], w[7, 2], lty = 3, lwd = 2, col = 'red') segments(w[7, 1], w[7, 2], w[10, 1], w[10, 2], lty = 3, lwd = 2, col = 'red') segments(w[8, 1], w[8, 2], w[11, 1], w[11, 2], lty = 3, lwd = 2, col = 'green3') segments(w[9, 1], w[9, 2], w[12, 1], w[12, 2], lty = 3, lwd = 2, col = 'orange') segments(w[10, 1], w[10, 2], w[11, 1], w[11, 2], lty = 3, lwd = 2) segments(w[11, 1], w[11, 2], w[12, 1], w[12, 2], lty = 3, lwd = 2) ######################################## NPCA of Boston Housing Data Set 波士顿房价数据集的主成分分析 ################### rm(list = ls(all = TRUE)) # clear all variables graphics.off() library(MASS) data = Boston head(data) # transform data xt = data xt[, 1] = log(data[, 1]) xt[, 2] = data[, 2]/10 xt[, 3] = log(data[, 3]) xt[, 5] = log(data[, 5]) xt[, 6] = log(data[, 6]) xt[, 7] = (data[, 7]^(2.5))/10000 xt[, 8] = log(data[, 8]) xt[, 9] = log(data[, 9]) xt[, 10] = log(data[, 10]) xt[, 11] = exp(0.4 * data[, 11])/1000 xt[, 12] = data[, 12]/100 xt[, 13] = sqrt(data[, 13]) xt[, 14] = log(data[, 14]) data = xt[, -4] head(data) n1 = nrow(data) n2 = ncol(data) # standardizes the data 数据标准化 x = (data - matrix(apply(data, 2, mean), n1, n2, byrow = T)) / matrix(sqrt((n1 - 1) * apply(data, 2, var)/n1), n1, n2, byrow = T) round(head(x), digits = 4) # spectral decomposition 相关矩阵的谱分解 eig = eigen((n1 - 1) * cov(x)/n1) e = eig$values # eigenvalues 特征值 perc = e/sum(e) # explained variance 各主成分的贡献率 cum = cumsum(e)/sum(e) # cumulative explained percentages 累积贡献率 evpc = data.frame(eigenvalues = e, percent = perc, cumpercent = cum) round(evpc, digits = 4) v = eig$vectors round(v, digits = 4) # eigenvectors 特征向量 # principal components 主成分的取值,即数据在各主成分的投影值 xv = as.matrix(x) %*% v round(head(xv), digits = 4) # correlations of the first 3 PC's with the original variables 原始变量与前三个主成分的相关系数 corr = cor(x, xv)[, 1:3] colnames(corr) = c('PC1', 'PC2', 'PC3') round(corr, digits = 4) # proportions of variables explained by first 3 PCs 各变量由前三个主成分解释的比例 cumr2 = apply(corr^2, 1, sum) round(as.matrix(cumr2), digits = 4) graphics.off() par(mfrow = c(2, 2)) # 变量在 PC1、PC2 方向的散点图 ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi)) plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "First PC", ylab = "Second PC", main = "Boston Housing", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6, lwd = 2, asp = 1) abline(h = 0, v = 0) label = c("X1", "X2", "X3", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14") text(corr[, 1], corr[, 2], label) # 变量在 PC2、PC3 方向的散点图 ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi)) plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "Third PC", ylab = "Second PC", main = "Boston Housing", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6, lwd = 2, asp = 1) abline(h = 0, v = 0) label = c("X1", "X2", "X3", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14") text(corr[, 3], corr[, 2], label) # 变量在 PC1、PC3 方向的散点图 ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi)) plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "First PC", ylab = "Third PC", main = "Boston Housing", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6, lwd = 2, asp = 1) abline(h = 0, v = 0) label = c("X1", "X2", "X3", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14") text(corr[, 1], corr[, 3], label) # 碎石图 plot(perc, xlab = 'PCs', ylab = 'Proportion of total variances explained by PCs', main = 'Scree Plot', pch = 16, cex = 2, axes = FALSE, ylim = c(0, 0.6)) axis(1, at = 1:13) axis(2, at = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6)) abline(h=0, lty=2) # price x[, 13] greater or less than average, 15 is >= average, 17 is < average, stored in h1 h = x[, 13] h[h >= mean(h)] = 15 h1 = h h1[h1 < mean(x[, 13])] = 17 hr = h1 # mark more expensive houses with red square hr[hr == 15] = "red" # mark cheaper houses with black triangle hr[hr == 17] = "black" # correction for signs of eigenvectors xv = xv * (-1) # 观测值投影在前两个主成分上的散点图:红色矩形框代表高于平均房价的观测值,黑色三角形框代表低于平均房价的观测值 graphics.off() plot(xv[, 1], xv[, 2], pch = h1, col = hr, xlab = "PC1", ylab = "PC2", main = "First vs. Second PC", cex.axis = 1.2, cex.lab = 1.2, cex.main = 1.6) abline(h=0, v=0, lty=2, col = 'cyan2') # houses close to the Charles River are indicated with red squares col = xt[, 4] col[col == 1] = "red" col[col == 0] = "black" h2 = xt[, 4] h2[h2 == 1] = 15 h2[h2 == 0] = 17 # 观测值投影在前两个主成分上的散点图:红色矩形框代表位于 Charles 河边的观测值,黑色三角形框代表远离 Charles 河的观测值 graphics.off() plot(xv[, 1], xv[, 2], pch = h2, col = col, xlab = "PC1", ylab = "PC2", main = "First vs. Second PC", cex.axis = 1.2, cex.lab = 1.2, cex.main = 1.6) abline(h=0, v=0, lty=2, col = 'cyan2') ######################################## NPCA of 79 US Companies 美国79家公司的标准化主成分分析 #################### rm(list = ls(all = TRUE)) # clear all variables graphics.off() setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data") library(readr) x = read_table("uscomp2.txt", col_name = FALSE) colnames(x) = c("Company", "A", "S", "MV", "P", "CF", "E", "Sector") attach(x) Sector = as.character(Sector) Sector[1:2] = "H" Sector[3:17] = "E" Sector[18:34] = "F" Sector[35:42] = "H" Sector[43:52] = "M" Sector[53:63] = "*" Sector[64:73] = "R" Sector[74:79] = "*" detach(x) head(x) # standardizes the data 数据标准化 x = as.matrix(x[, 2:7]) n = nrow(x) # number of rows x = (x - matrix(apply(x, 2, mean), n, 6, byrow = T)) / matrix(sqrt((n - 1) * apply(x, 2, var)/n), n, 6, byrow = T) round(head(x), digits = 4) # spectral decomposition 相关矩阵的谱分解 eig = eigen((n - 1) * cov(x)/n) e = eig$values round(as.matrix(e), digits = 3) v = eig$vectors round(v, digits = 3) # principal components 主成分取值 x = as.matrix(x) %*% v graphics.off() plot(cbind(-x[, 1], -x[, 2]), type = "n", xlab = "PC 1", ylab = "PC 2", main = "First vs. Second PC", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6) text(cbind(-x[, 1], -x[, 2]), Sector) # scree plot 碎石图 plot(e, xlab = "Index", ylab = "Lambda", main = "Eigenvalues of S", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6, pch = 16, cex = 1.5) abline(h = 0, lty = 2) #### NPCA of 79 US Companies without IBM and GE 不含 IBM 和 GE 的标准化主成分分析 rm(list = ls(all = TRUE)) # clear all variables graphics.off() setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data") library(readr) x = read_table("uscomp2.txt", col_name = FALSE) x = rbind(x[1:37, ], x[39, ], x[41:79, ]) colnames(x) = c("Company", "A", "S", "MV", "P", "CF", "E", "Sector") attach(x) Sector = as.character(Sector) Sector[1:2] = "H" Sector[3:17] = "E" Sector[18:34] = "F" Sector[35:40] = "H" Sector[41:50] = "M" Sector[51:61] = "*" Sector[62:71] = "R" Sector[72:77] = "*" detach(x) x = x[, 2:7] n = nrow(x) # standardizes the data 数据标准化 x = (x - matrix(apply(x, 2, mean), n, 6, byrow = T)) / matrix(sqrt((n - 1) * apply(x, 2, var)/n), n, 6, byrow = T) # spectral decomposition 谱分解 eig = eigen((n - 1) * cov(x)/n) e = eig$values round(as.matrix(e), digits = 3) v = eig$vectors round(v, digits = 3) # principal components 主成分值 x = as.matrix(x) %*% v plot(-x[, 1], x[, 2], xlim = c(-2, 8), ylim = c(-8, 8), type = "n", xlab = "PC 1", ylab = "PC 2", main = "First vs. Second PC") text(-x[, 1], x[, 2], Sector) abline(h = 0, v = 0, lty = 2, col = 'orange') # scree plot 碎石图 plot(e, xlab = "Index", ylab = "Lambda", main = "Eigenvalues of S", pch = 16, cex = 1.5) abline(h = 0, lty = 2, col = 'orange') #### Plot for the relative proportion of variance explained by PCs rm(list = ls(all = TRUE)) # clear all variables graphics.off() setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data") library(readr) x = read_table("uscomp2.txt", col_name = FALSE) x = rbind(x[1:37, ], x[39, ], x[41:79, ]) x = x[, 2:7] n = nrow(x) x = scale(x) # standardizes the data matrix e = eigen((n - 1) * cov(x)/n) # calculates eigenvalues and eigenvectors e1 = e$values evprop = data.frame(eigenvalues = e1, proportion = e1/sum(e1), cumprop = cumsum(e1)/sum(e1)) round(evprop, digits = 3) r = x %*% e$vectors # principal components r = cor(cbind(r, x)) # correlation between PCs and variables r1 = r[7:12, 1:2] # correlation of the two most important PCs and variables corpcx = data.frame(PC1 = r1[, 1], PC2 = r1[, 2], SquaredSum = r1[, 1]^2 + r1[, 2]^2) rownames(corpcx) = c('X1', 'X2', 'X3', 'X4', 'X5', 'X6') round(corpcx, digits = 2) ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi)) plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "First PC", ylab = "Second PC", main = "U.S. Company Data", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.8, lwd = 2, asp = 1) abline(h = 0, v = 0) label = c("X1", "X2", "X3", "X4", "X5", "X6") text(-r1[, 1], r1[, 2], label, cex = 1.2) #################### NPCA of USJudgeRatings ################################################################## rm(list = ls(all = TRUE)) # clear all variables graphics.off() library(psych) x = USJudgeRatings head(x) ?USJudgeRatings r = cor(x) # 计算相关矩阵 round(r, digits = 3) NPCA.x = princomp(x, cor = TRUE, scores = TRUE, fix_sign = TRUE) # 标准化主成分分析 str(NPCA.x) # princomp 结果中的内容 round(NPCA.x$sdev, digits = 3) # 主成分标准差(方差的平方根) round(NPCA.x$loadings, digits = 3) # 主成分载荷 round(NPCA.x$loadings[,1], digits = 3) # 第一主成分在原始变量的权重 round(NPCA.x$loadings[,2], digits = 3) # 第二主成分在原始变量的权重 screeplot(NPCA.x, main = 'Scree Plot of NPCA for USJudgeRatings') # 碎石图 round(NPCA.x$center, digits = 3) # 各个变量的均值(中心) round(NPCA.x$scale, digits = 3) # 作用于各个变量的尺度 NPCA.x$n.obs # 观测值的个数 round(NPCA.x$scores, digits = 3) # 观测值在各主成分的投影值 scores_x = cbind(NPCA.x$scores, x) # 主成分值与原始数据值的合并数据矩阵 cor.NPC.x = cor(scores_x) # 主成分与原始变量的相关系数 cor.NPC.x.2 = cor.NPC.x[13:24, 1:2] # 原始变量与前两个主成分的相关系数 round(cor.NPC.x.2, digits = 3) # 变量在前两个主方向的散点图 graphics.off() ucircle = cbind(cos((0:360)/180 * pi), sin((0:360)/180 * pi)) plot(ucircle, type = "l", lty = "solid", col = "blue", xlab = "First PC", ylab = "Second PC", main = "USJusgeRatings", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.8, lwd = 2, asp = 1) abline(h = 0, v = 0) label = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12") # label = row.names(cor.NPC.x.2) text(-cor.NPC.x.2[, 1], cor.NPC.x.2[, 2], label, cex = 1.0) # 观测值在前两个主成分的散点图 graphics.off() plot(cbind(NPCA.x$scores[, 1], NPCA.x$scores[, 2]), type = "p", xlab = "PC 1", ylab = "PC 2", pch = 16, main = "First NPC vs. Second NPC", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6) abline(h = 0, v = 0, lty = 2) graphics.off() plot(cbind(NPCA.x$scores[, 1], NPCA.x$scores[, 2]), type = "n", xlab = "PC 1", ylab = "PC 2", main = "First NPC vs. Second NPC", cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.6) abline(h = 0, v = 0, lty = 2) label = 1:NPCA.x$n.obs text(NPCA.x$scores[, 1], NPCA.x$scores[, 2], label) # 前两个主成分得分按照方差的加权平均,对所有观测值(法官)排序 judge = (NPCA.x$sdev[1]^2 * NPCA.x$scores[, 1] + NPCA.x$sdev[2]^2 * NPCA.x$scores[, 2]) / (NPCA.x$sdev[1]^2 + NPCA.x$sdev[2]^2) rank = rank(judge) # 综合排序 rank sort.judge = cbind(NPCA.x$scores[, 1], NPCA.x$scores[, 2], judge, rank) sort.judge