Chapter_14.R · 芝士酷

#### ---------- ---------- ---------- 判别分析 ---------- ---------- ----------
rm(list = ls(all = TRUE)) # clear all variables
graphics.off()
library(ggplot2)
library(plotly)

x = seq(-5, 5, length = 201)
y1 = dnorm(x, mean = -1, sd = 1)
y2 = dnorm(x, mean = 2, sd = 1/2)

R1_x = c(0.9, 0.9, -5, -5, 0.9)
R1_y = c(0, 0.9, 0.9, 0, 0)
R2_x = c(0.9, 0.9, 5, 5, 0.9)
R2_y = c(0, 0.9, 0.9, 0, 0)

Fig_1 = plot_ly(x = ~x, y = ~y1, name = "N(-1, 1)") %>%
  add_polygons(x = ~R1_x, y = ~R1_y, name = "") %>%
  add_polygons(x = ~R2_x, y = ~R2_y, name = "") %>%
  add_lines() %>%
  add_lines(x = ~x, y = ~y2, name = "N(2, 1/2)") %>%
  add_text(x = -1, y = 0.43, size = I(100),
           text = TeX("\\Pi_1"), 
           textposition = "center") %>%
  add_text(x = 2.5, y = 0.7, size = I(100),
           text = TeX("\\Pi_2"), 
           textposition = "center") %>%
  add_text(x = -3, y = 0.6, size = I(100),
           text = TeX("R_1"), 
           textposition = "center") %>%
  add_text(x = 4, y = 0.6, size = I(100),
           text = TeX("R_2"), 
           textposition = "center") %>%
  config(mathjax = "cdn")
Fig_1

## ---------- Car Mark data set ----------
rm(list = ls(all = TRUE))
graphics.off()
library(mclust)
x = banknote
x[c(1:3, 101:103), ]

## ---------- 两个正态总体 ----------
rm(list = ls(all = TRUE))

## 方差不等
s1 = 1
mu1 = 0
s2 = 0.5
mu2	= 1
curve(dnorm(x, mu1, s1), -3, 3, col = "red", ylim = c(0, 0.8), axes = FALSE, xlab = "", ylab = "Densities", lwd = 2)
axis(1, at = -3:3, pos = 0)
axis(2, at = c(0, 0.2, 0.4, 0.6, 0.8), pos = -3)
curve(dnorm(x, mu2, s2), -1, 3, col = "blue", add = TRUE, lty = 2, lwd = 2)
title(main = "Two Normal Distributions")
abline(v = c(0.38, 2.28), lwd = 3)
text(-1, 0.5, "R1", col = "red", cex = 2)
text(2.6, 0.5, "R1", col = "red", cex = 2)
text(1.5, 0.5, "R2", col = "green4", cex = 2)

## 方差相等
s1 = 1
mu1 = 0
s2 = 1
mu2	= 3
curve(dnorm(x, mu1, s1), -3, 3, col = "red", xlim = c(-3, 6), ylim = c(0, 0.4), axes = FALSE, 
      xlab = "", ylab = "Densities", lwd = 2)
axis(1, at = -3:6, pos = 0)
axis(2, at = c(0, 0.1, 0.2, 0.3, 0.4), pos = -3)
curve(dnorm(x, mu2, s2), 0, 6, col = "blue", add = TRUE, lty = 2, lwd = 2)
title(main = "Two Normal Distributions")
abline(v = 1.5, lwd = 3)
text(0, 0.2, "R1", col = "red", cex = 2)
text(3, 0.2, "R2", col = "green4", cex = 2)

## ---------- 真伪钞票数据集 20 个观测值的判别 ----------
rm(list = ls(all = TRUE))
graphics.off()
library(mclust)
data(banknote)
set.seed(1993)
n0 = sample(1:200, size = 20, replace = FALSE)
x = banknote[n0, ]
data.g = subset(x, Status == "genuine", select = 2:7) # 真钞子集
data.g
data.f = subset(x, Status == "counterfeit", select = 2:7) # 伪钞子集
data.f

mu.g = apply(data.g, 2, mean) # 真钞的均值向量
mu.g
mu.f = apply(data.f, 2, mean) # 伪钞的均值向量
mu.f

s.g = cov(data.g) * (dim(data.g)[1] - 1) / (dim(data.g)[1]) # 真钞的样本协方差矩阵
round(s.g, digits = 4)
s.f = cov(data.f) * (dim(data.f)[1] - 1) / (dim(data.f)[1]) # 伪钞的样本协方差矩阵
round(s.f, digits = 4)
su = ((dim(data.g)[1]) * s.g + (dim(data.f)[1]) * s.f) / (20 - 2) # 合并计算的样本协方差矩阵
round(su, digits = 4)

mu.g = as.matrix(mu.g)
mu.f = as.matrix(mu.f)
su = as.matrix(su)
alpha = solve(su) %*% (mu.g - mu.f) # 计算划分两个总体的超平面用到的 alpha
round(alpha, digits = 4)
mu = (mu.g + mu.f) / 2 # 计算划分两个总体的超平面用到的 mu
round(mu, digits = 4)

data.x = as.matrix(banknote[, 2:7]) # 真伪钞数据集全体
MU = matrix(mu, nrow = dim(data.x)[1], ncol = 6, byrow = TRUE) # 均值向量构造的矩阵
disc.f = (data.x - MU) %*% alpha # 计算每个体的判别函数值：正判为真钞、负判为伪钞
disc.f[1:100, ] # 真钞的判别函数值
sum(disc.f[1:100, 1] < 0) # 真钞判为伪钞的数量
disc.f[101:200, ] # 伪钞的判别函数值
sum(disc.f[101:200, 1] > 0) # 伪钞判为真钞的数量

## ---------- 真伪钞票数据集的 Fisher 判别 ----------
rm(list = ls(all = TRUE))
graphics.off()
library(mclust)
data(banknote)
xg = as.matrix(banknote[1:100, 2:7]) # 真钞数据
xf = as.matrix(banknote[101:200, 2:7]) # 伪钞数据
mean.xg = as.matrix(apply(xg, 2, mean)) # 真钞数据的均值
mean.xf = as.matrix(apply(xf, 2, mean)) # 伪钞数据的均值

m = (mean.xg + mean.xf)/2 # 两个总体均值的平均
w = 100 * (cov(xg) + cov(xf)) # 组内平方和对应的矩阵
d = mean.xg - mean.xf # 均值差
a = solve(w) %*% d # 确定线性组合（投影）的系数
round(a, digits = 3)

yg = as.matrix(xg - matrix(m, nrow = 100, ncol = 6, byrow = T)) %*% a # 计算真钞数据的投影值
yf = as.matrix(xf - matrix(m, nrow = 100, ncol = 6, byrow = T)) %*% a # 计算伪钞数据的投影值

xgtest = yg
sg = sum(xgtest < 0) # 真钞判归伪钞的数量
sg

xftest = yf
sf = sum(xftest > 0) # 伪钞判归真钞的数量
sf

fg = density(yg) # 真钞投影值的核密度估计
ff = density(yf) # 伪钞投影值的核密度估计

# 绘制两个总体投影值核密度估计曲线
plot(ff, lwd = 3, col = "red", xlab = "", ylab = "Densities of Projections", 
     main = "Densities of Projections of Swiss bank notes", axes = FALSE,
     xlim = c(-0.2, 0.2), cex.lab = 1.2, cex.axis = 1.2, cex.main = 1.8)
lines(fg, lwd = 3, col = "blue", lty = 2)
text(mean(yf), 3.72, "Forged", col = "red")
text(mean(yg), 2.72, "Genuine", col = "blue")
axis(1, pos = 0)
axis(2, pos = -0.2)

## ---------- iris 数据集的判别分析 - Fisher 线性判别 ----------
rm(list = ls(all = TRUE))
graphics.off()
options(digits = 3)

library(MASS)
str(iris)
head(iris)
x = iris[, 1:4] # 读入数据

?gl
g = gl(n = 3, k = 50) # 生成有三类、每类 50 个对象的分类因子
g
x = cbind(x, Type = g) # 将分类因子 g 添加到数据中
head(x)

attach(x)
x.lda = lda(Type ~ Petal.Length + Petal.Width + Sepal.Length + Sepal.Width) # 方差相等下的 Bayes 判别
x.lda # 显示结果

plot(x.lda, dimen = 1) # 绘制一个线性判别函数的图形

colors = rep(1:3, times = c(50, 50, 50))
plot(x.lda, cex = 1.5, dimen = 2, col = colors) # 绘制两个线性判别函数的图形

x.pred = predict(x.lda) # 回判
x.pred$class # 显示回判的结果
x.pred$posterior # 后验概率
x.pred$x # 线性判别函数 (投影) 的取值

x.alloc = cbind(Type = g, x.pred$x, Allocations = x.pred$class) # 真实类型、判别类型对照 (含判别函数)
x.alloc

x.confu = table(g, x.pred$class) # 混淆矩阵
x.confu

?prop.table
x.prob = prop.table(x.confu) # 混淆矩阵对应的比例
x.prob

sum(diag(x.prob)) # 判对率
1 - sum(diag(x.prob)) # 错判率

x.alloc = as.data.frame(x.alloc)
?ldahist # 多组数据的直方图或核密度图
ldahist(x.alloc$LD1, g, type = "b") # 第一线性判别函数(投影)值的直方图与核密度
ldahist(x.alloc$LD2, g, type = "b") # 第二线性判别函数(投影)值的直方图与核密度

pairs(x.lda, cex = 1.5, col = colors) # 线性判别的二维散点图
detach(x)
## ---------- iris 数据集的判别分析 - 二次判别函数 ----------
rm(list = ls(all = TRUE))
graphics.off()
options(digits = 3)

library(MASS)
x = iris[, 1:4] # 读入数据

g = gl(n = 3, k = 50) # 生成有三类、每类 50 个对象的分类因子
x = cbind(x, Type = g) # 将分类因子 g 添加到数据中

attach(x)
x.qda = qda(Type ~ Petal.Length + Petal.Width + Sepal.Length + Sepal.Width) # 方差不等条件下的二次 Bayes 判别
x.qda # 显示结果

x.qpred = predict(x.qda) # 回判
x.qpred$class # 回判结果
round(x.qpred$posterior, digits = 2) # 回判的后验概率

x.qalloc = cbind(Type = g, Allocations = x.qpred$class) # 真实类型、判别类型对照
x.qconfu = table(g, x.qpred$class) # 混淆矩阵
x.qconfu

x.qprob = prop.table(x.qconfu) # 混淆矩阵对应的比例
x.qprob
sum(diag(x.qprob)) # 判对率
1 - sum(diag(x.qprob)) # 错判率
detach(x)

## ---------- 健康人群与心梗患者的判别分析 ----------
rm(list = ls(all = TRUE))
graphics.off()
library(MASS)
library(data.table)
setwd("~/Desktop/2026_Multivariate Statistical Analysis/R_Code/Data")
x = fread("myocardial.csv", header = TRUE) # 读入数据
x
x = as.data.frame(x[, 2:5])
x$Type = as.factor(x$Type)
x

attach(x)
x.lda = lda(Type ~ X1 + X2 + X3) # 方差相等条件下的 Bayes 判别
x.lda # 显示判别结果

plot(x.lda, dimen = 1, type = "b") # 绘制线性判别函数的图形

x.pred = predict(x.lda) # 回判
x.pred$class # 显示回判的结果
round(x.pred$posterior, digits = 2) # 后验概率
x.pred$x # 线性判别函数 (投影) 的取值

x.alloc = cbind(x, x.pred$x, Allocations = x.pred$class) # 真实类型、判别类型对照 (含判别函数)
x.alloc

x.confu = table(x$Type, x.pred$class) # 混淆矩阵
x.confu

x.prob = prop.table(x.confu) # 混淆矩阵对应的比例
x.prob

sum(diag(x.prob)) # 判对率
1 - sum(diag(x.prob)) # 错判率

x.new = data.frame(X1 = c(420.50), X2 = c(32.42), X3 = c(1.98)) # 新的观测数据
x.new
x.new.pred = predict(x.lda, newdata = x.new) # 利用上述判别分析模型的预测
x.new.pred # 查看结果
x.new.pred$posterior # 查看后验概率：属于第2类的概率 0.78，判此人为心梗患者
detach(x)