#### ---------- ---------- ---------- 判别分析 ---------- ---------- ---------- rm(list = ls(all = TRUE)) # clear all variables graphics.off() library(ggplot2) library(plotly) x = seq(-5, 5, length = 201) y1 = dnorm(x, mean = -1, sd = 1) y2 = dnorm(x, mean = 2, sd = 1/2) R1_x = c(0.9, 0.9, -5, -5, 0.9) R1_y = c(0, 0.9, 0.9, 0, 0) R2_x = c(0.9, 0.9, 5, 5, 0.9) R2_y = c(0, 0.9, 0.9, 0, 0) Fig_1 = plot_ly(x = ~x, y = ~y1, name = "N(-1, 1)") %>% add_polygons(x = ~R1_x, y = ~R1_y, name = "") %>% add_polygons(x = ~R2_x, y = ~R2_y, name = "") %>% add_lines() %>% add_lines(x = ~x, y = ~y2, name = "N(2, 1/2)") %>% add_text(x = -1, y = 0.43, size = I(100), text = TeX("\\Pi_1"), textposition = "center") %>% add_text(x = 2.5, y = 0.7, size = I(100), text = TeX("\\Pi_2"), textposition = "center") %>% add_text(x = -3, y = 0.6, size = I(100), text = TeX("R_1"), textposition = "center") %>% add_text(x = 4, y = 0.6, size = I(100), text = TeX("R_2"), textposition = "center") %>% config(mathjax = "cdn") Fig_1 ## ---------- Car Mark data set ---------- rm(list = ls(all = TRUE)) graphics.off() library(mclust) x = banknote x[c(1:3, 101:103), ] ## ---------- 两个正态总体 ---------- rm(list = ls(all = TRUE)) ## 方差不等 s1 = 1 mu1 = 0 s2 = 0.5 mu2 = 1 curve(dnorm(x, mu1, s1), -3, 3, col = "red", ylim = c(0, 0.8), axes = FALSE, xlab = "", ylab = "Densities", lwd = 2) axis(1, at = -3:3, pos = 0) axis(2, at = c(0, 0.2, 0.4, 0.6, 0.8), pos = -3) curve(dnorm(x, mu2, s2), -1, 3, col = "blue", add = TRUE, lty = 2, lwd = 2) title(main = "Two Normal Distributions") abline(v = c(0.38, 2.28), lwd = 3) text(-1, 0.5, "R1", col = "red", cex = 2) text(2.6, 0.5, "R1", col = "red", cex = 2) text(1.5, 0.5, "R2", col = "green4", cex = 2) ## 方差相等 s1 = 1 mu1 = 0 s2 = 1 mu2 = 3 curve(dnorm(x, mu1, s1), -3, 3, col = "red", xlim = c(-3, 6), ylim = c(0, 0.4), axes = FALSE, xlab = "", ylab = "Densities", lwd = 2) axis(1, at = -3:6, pos = 0) axis(2, at = c(0, 0.1, 0.2, 0.3, 0.4), pos = -3) curve(dnorm(x, mu2, s2), 0, 6, col = "blue", add = TRUE, lty = 2, lwd = 2) title(main = "Two Normal Distributions") abline(v = 1.5, lwd = 3) text(0, 0.2, "R1", col = "red", cex = 2) text(3, 0.2, "R2", col = "green4", cex = 2) ## ---------- 真伪钞票数据集 20 个观测值的判别 ---------- rm(list = ls(all = TRUE)) graphics.off() library(mclust) data(banknote) set.seed(1993) n0 = sample(1:200, size = 20, replace = FALSE) x = banknote[n0, ] data.g = subset(x, Status == "genuine", select = 2:7) # 真钞子集 data.g data.f = subset(x, Status == "counterfeit", select = 2:7) # 伪钞子集 data.f mu.g = apply(data.g, 2, mean) # 真钞的均值向量 mu.g mu.f = apply(data.f, 2, mean) # 伪钞的均值向量 mu.f s.g = cov(data.g) * (dim(data.g)[1] - 1) / (dim(data.g)[1]) # 真钞的样本协方差矩阵 round(s.g, digits = 4) s.f = cov(data.f) * (dim(data.f)[1] - 1) / (dim(data.f)[1]) # 伪钞的样本协方差矩阵 round(s.f, digits = 4) su = ((dim(data.g)[1]) * s.g + (dim(data.f)[1]) * s.f) / (20 - 2) # 合并计算的样本协方差矩阵 round(su, digits = 4) mu.g = as.matrix(mu.g) mu.f = as.matrix(mu.f) su = as.matrix(su) alpha = solve(su) %*% (mu.g - mu.f) # 计算划分两个总体的超平面用到的 alpha round(alpha, digits = 4) mu = (mu.g + mu.f) / 2 # 计算划分两个总体的超平面用到的 mu round(mu, digits = 4) data.x = as.matrix(banknote[, 2:7]) # 真伪钞数据集全体 MU = matrix(mu, nrow = dim(data.x)[1], ncol = 6, byrow = TRUE) # 均值向量构造的矩阵 disc.f = (data.x - MU) %*% alpha # 计算每个体的判别函数值:正判为真钞、负判为伪钞 disc.f[1:100, ] # 真钞的判别函数值 sum(disc.f[1:100, 1] < 0) # 真钞判为伪钞的数量 disc.f[101:200, ] # 伪钞的判别函数值 sum(disc.f[101:200, 1] > 0) # 伪钞判为真钞的数量 ## ---------- 真伪钞票数据集的 Fisher 判别 ---------- rm(list = ls(all = TRUE)) graphics.off() library(mclust) data(banknote) xg = as.matrix(banknote[1:100, 2:7]) # 真钞数据 xf = as.matrix(banknote[101:200, 2:7]) # 伪钞数据 mean.xg = as.matrix(apply(xg, 2, mean)) # 真钞数据的均值 mean.xf = as.matrix(apply(xf, 2, mean)) # 伪钞数据的均值 m = (mean.xg + mean.xf)/2 # 两个总体均值的平均 w = 100 * (cov(xg) + cov(xf)) # 组内平方和对应的矩阵 d = mean.xg - mean.xf # 均值差 a = solve(w) %*% d # 确定线性组合(投影)的系数 round(a, digits = 3) yg = as.matrix(xg - matrix(m, nrow = 100, ncol = 6, byrow = T)) %*% a # 计算真钞数据的投影值 yf = as.matrix(xf - matrix(m, nrow = 100, ncol = 6, byrow = T)) %*% a # 计算伪钞数据的投影值 xgtest = yg sg = sum(xgtest < 0) # 真钞判归伪钞的数量 sg xftest = yf sf = sum(xftest > 0) # 伪钞判归真钞的数量 sf fg = density(yg) # 真钞投影值的核密度估计 ff = density(yf) # 伪钞投影值的核密度估计 # 绘制两个总体投影值核密度估计曲线 plot(ff, lwd = 3, col = "red", xlab = "", ylab = "Densities of Projections", main = "Densities of Projections of Swiss bank notes", axes = FALSE, xlim = c(-0.2, 0.2), cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.8) lines(fg, lwd = 3, col = "blue", lty = 2) text(mean(yf), 3.72, "Forged", col = "red") text(mean(yg), 2.72, "Genuine", col = "blue") axis(1, pos = 0) axis(2, pos = -0.2) ## ---------- iris 数据集的判别分析 - Fisher 线性判别 ---------- rm(list = ls(all = TRUE)) graphics.off() options(digits = 3) library(MASS) str(iris) head(iris) x = iris[, 1:4] # 读入数据 ?gl g = gl(n = 3, k = 50) # 生成有三类、每类 50 个对象的分类因子 g x = cbind(x, Type = g) # 将分类因子 g 添加到数据中 head(x) attach(x) x.lda = lda(Type ~ Petal.Length + Petal.Width + Sepal.Length + Sepal.Width) # 方差相等下的 Bayes 判别 x.lda # 显示结果 plot(x.lda, dimen = 1) # 绘制一个线性判别函数的图形 colors = rep(1:3, times = c(50, 50, 50)) plot(x.lda, cex = 1.5, dimen = 2, col = colors) # 绘制两个线性判别函数的图形 x.pred = predict(x.lda) # 回判 x.pred$class # 显示回判的结果 x.pred$posterior # 后验概率 x.pred$x # 线性判别函数 (投影) 的取值 x.alloc = cbind(Type = g, x.pred$x, Allocations = x.pred$class) # 真实类型、判别类型对照 (含判别函数) x.alloc x.confu = table(g, x.pred$class) # 混淆矩阵 x.confu ?prop.table x.prob = prop.table(x.confu) # 混淆矩阵对应的比例 x.prob sum(diag(x.prob)) # 判对率 1 - sum(diag(x.prob)) # 错判率 x.alloc = as.data.frame(x.alloc) ?ldahist # 多组数据的直方图或核密度图 ldahist(x.alloc$LD1, g, type = "b") # 第一线性判别函数(投影)值的直方图与核密度 ldahist(x.alloc$LD2, g, type = "b") # 第二线性判别函数(投影)值的直方图与核密度 pairs(x.lda, cex = 1.5, col = colors) # 线性判别的二维散点图 detach(x) ## ---------- iris 数据集的判别分析 - 二次判别函数 ---------- rm(list = ls(all = TRUE)) graphics.off() options(digits = 3) library(MASS) x = iris[, 1:4] # 读入数据 g = gl(n = 3, k = 50) # 生成有三类、每类 50 个对象的分类因子 x = cbind(x, Type = g) # 将分类因子 g 添加到数据中 attach(x) x.qda = qda(Type ~ Petal.Length + Petal.Width + Sepal.Length + Sepal.Width) # 方差不等条件下的二次 Bayes 判别 x.qda # 显示结果 x.qpred = predict(x.qda) # 回判 x.qpred$class # 回判结果 round(x.qpred$posterior, digits = 2) # 回判的后验概率 x.qalloc = cbind(Type = g, Allocations = x.qpred$class) # 真实类型、判别类型对照 x.qconfu = table(g, x.qpred$class) # 混淆矩阵 x.qconfu x.qprob = prop.table(x.qconfu) # 混淆矩阵对应的比例 x.qprob sum(diag(x.qprob)) # 判对率 1 - sum(diag(x.qprob)) # 错判率 detach(x) ## ---------- 健康人群与心梗患者的判别分析 ---------- rm(list = ls(all = TRUE)) graphics.off() library(MASS) library(data.table) setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data") x = fread("myocardial.csv", header = TRUE) # 读入数据 x x = as.data.frame(x[, 2:5]) x$Type = as.factor(x$Type) x attach(x) x.lda = lda(Type ~ X1 + X2 + X3) # 方差相等条件下的 Bayes 判别 x.lda # 显示判别结果 plot(x.lda, dimen = 1, type = "b") # 绘制线性判别函数的图形 x.pred = predict(x.lda) # 回判 x.pred$class # 显示回判的结果 round(x.pred$posterior, digits = 2) # 后验概率 x.pred$x # 线性判别函数 (投影) 的取值 x.alloc = cbind(x, x.pred$x, Allocations = x.pred$class) # 真实类型、判别类型对照 (含判别函数) x.alloc x.confu = table(x$Type, x.pred$class) # 混淆矩阵 x.confu x.prob = prop.table(x.confu) # 混淆矩阵对应的比例 x.prob sum(diag(x.prob)) # 判对率 1 - sum(diag(x.prob)) # 错判率 x.new = data.frame(X1 = c(420.50), X2 = c(32.42), X3 = c(1.98)) # 新的观测数据 x.new x.new.pred = predict(x.lda, newdata = x.new) # 利用上述判别分析模型的预测 x.new.pred # 查看结果 x.new.pred$posterior # 查看后验概率:属于第2类的概率 0.78,判此人为心梗患者 detach(x)