mvsa-final-report-analysis.R

rm(list = ls(all = TRUE))
graphics.off()

set.seed(20260607)
options(digits = 4)

data_file <- "Default of Credit Card Clients.csv"
fig_dir <- "mvsa_figures"
dir.create(fig_dir, showWarnings = FALSE)

## ---------- utility functions ----------
split_stratified <- function(y, prop = 0.8, seed = 1) {
  set.seed(seed)
  idx <- unlist(lapply(split(seq_along(y), y), function(ind) {
    sample(ind, floor(length(ind) * prop))
  }))
  sort(idx)
}

confusion_metrics <- function(actual, pred, positive = 1) {
  actual <- as.integer(actual)
  pred <- as.integer(pred)
  tab <- table(
    Actual = factor(actual, levels = c(0, 1)),
    Predicted = factor(pred, levels = c(0, 1))
  )
  acc <- sum(diag(tab)) / sum(tab)
  sens <- tab["1", "1"] / sum(tab["1", ])
  spec <- tab["0", "0"] / sum(tab["0", ])
  precision <- ifelse(sum(tab[, "1"]) == 0, NA, tab["1", "1"] / sum(tab[, "1"]))
  data.frame(
    Accuracy = acc,
    Sensitivity = sens,
    Specificity = spec,
    Precision = precision,
    ErrorRate = 1 - acc
  )
}

best_cluster_metrics <- function(actual, cluster) {
  cluster <- as.integer(as.factor(cluster))
  pred0 <- cluster - 1
  m0 <- confusion_metrics(actual, pred0)
  m1 <- confusion_metrics(actual, 1 - pred0)
  if (m0$Accuracy >= m1$Accuracy) {
    list(pred = pred0, metrics = m0, table = table(Actual = actual, Cluster = pred0))
  } else {
    list(pred = 1 - pred0, metrics = m1, table = table(Actual = actual, Cluster = 1 - pred0))
  }
}

auc_rank <- function(y, score) {
  y <- as.integer(y)
  r <- rank(score, ties.method = "average")
  n1 <- sum(y == 1)
  n0 <- sum(y == 0)
  (sum(r[y == 1]) - n1 * (n1 + 1) / 2) / (n1 * n0)
}

round_df <- function(x, digits = 4) {
  x <- as.data.frame(x)
  num <- vapply(x, is.numeric, logical(1))
  x[num] <- lapply(x[num], round, digits)
  x
}

write_table <- function(name, x) {
  cat("\n====", name, "====\n")
  print(x)
}

## ---------- read and preprocess data ----------
data <- read.csv(data_file, check.names = FALSE)
X <- data[, paste0("X", 1:23)]
Y <- data$Y
X_scaled <- scale(X)
y_factor <- factor(Y, levels = c(0, 1), labels = c("NoDefault", "Default"))

out <- file("mvsa_numeric_results.txt", open = "wt", encoding = "UTF-8")
sink(out)

cat("Data dimension:", paste(dim(data), collapse = " x "), "\n")
cat("Missing values:", sum(is.na(data)), "\n")
cat("Y table:\n")
print(table(Y))
cat("Y proportion:\n")
print(round(prop.table(table(Y)), 4))

desc_selected <- data.frame(
  Variable = names(X),
  Mean = sapply(X, mean),
  SD = sapply(X, sd),
  Min = sapply(X, min),
  Median = sapply(X, median),
  Max = sapply(X, max)
)
write_table("Descriptive statistics", round_df(desc_selected, 2))

## ---------- descriptive plots ----------
png(file.path(fig_dir, "desc_y_bar.png"), width = 1100, height = 760, res = 140)
barplot(table(Y),
        col = c("#80B1D3", "#FB8072"),
        names.arg = c("未违约(0)", "违约(1)"),
        main = "响应变量 Y 的类别分布",
        ylab = "人数")
text(x = c(0.7, 1.9), y = as.numeric(table(Y)) + 500,
     labels = paste0(as.numeric(table(Y)), "人"), cex = 1.1)
dev.off()

png(file.path(fig_dir, "desc_boxplot_keyvars.png"), width = 1200, height = 820, res = 140)
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
boxplot(X1 ~ Y, data = data, col = c("#B3DE69", "#FDB462"),
        names = c("未违约", "违约"), main = "授信额度 X1", ylab = "新台币")
boxplot(X5 ~ Y, data = data, col = c("#B3DE69", "#FDB462"),
        names = c("未违约", "违约"), main = "年龄 X5", ylab = "岁")
boxplot(X6 ~ Y, data = data, col = c("#B3DE69", "#FDB462"),
        names = c("未违约", "违约"), main = "9月还款状态 X6", ylab = "状态")
boxplot(X18 ~ Y, data = data, col = c("#B3DE69", "#FDB462"),
        names = c("未违约", "违约"), main = "9月还款金额 X18", ylab = "新台币")
dev.off()

## ---------- Logistic regression ----------
train80 <- split_stratified(Y, 0.8, 20260607)
train_log <- data[train80, c(paste0("X", 1:23), "Y")]
test_log <- data[-train80, c(paste0("X", 1:23), "Y")]
log_fit <- glm(Y ~ ., data = train_log, family = binomial(), control = glm.control(maxit = 100))
log_prob <- predict(log_fit, newdata = test_log, type = "response")
log_pred <- ifelse(log_prob >= 0.5, 1, 0)
log_cm <- table(
  Actual = factor(test_log$Y, levels = c(0, 1)),
  Predicted = factor(log_pred, levels = c(0, 1))
)
log_metrics <- confusion_metrics(test_log$Y, log_pred)
log_auc <- auc_rank(test_log$Y, log_prob)
coef_tab <- summary(log_fit)$coefficients
coef_key <- coef_tab[-1, , drop = FALSE]
top_coef <- coef_key[order(abs(coef_key[, "z value"]), decreasing = TRUE)[1:8], ]

write_table("Logistic confusion matrix", log_cm)
write_table("Logistic metrics", round_df(log_metrics, 4))
cat("Logistic AUC:", round(log_auc, 4), "\n")
write_table("Top logistic coefficients by |z|", round_df(top_coef, 4))

## ---------- PCA ----------
pca <- prcomp(X_scaled, center = FALSE, scale. = FALSE)
eig <- pca$sdev^2
pca_prop <- eig / sum(eig)
pca_cum <- cumsum(pca_prop)
k_eig1 <- sum(eig > 1)
k_80 <- which(pca_cum >= 0.80)[1]
pca_table <- data.frame(
  PC = paste0("PC", 1:10),
  Eigenvalue = eig[1:10],
  Proportion = pca_prop[1:10],
  Cumulative = pca_cum[1:10]
)
write_table("PCA variance table", round_df(pca_table, 4))
cat("PCA k eigenvalue>1:", k_eig1, "\n")
cat("PCA k cumulative>=80%:", k_80, "\n")

pca_cor12 <- sweep(pca$rotation[, 1:2], 2, pca$sdev[1:2], "*")
colnames(pca_cor12) <- c("PC1", "PC2")
pca_top <- data.frame(
  PC1_positive = names(sort(pca_cor12[, 1], decreasing = TRUE))[1:5],
  PC1_negative = names(sort(pca_cor12[, 1], decreasing = FALSE))[1:5],
  PC2_positive = names(sort(pca_cor12[, 2], decreasing = TRUE))[1:5],
  PC2_negative = names(sort(pca_cor12[, 2], decreasing = FALSE))[1:5]
)
write_table("PCA top variables", pca_top)

png(file.path(fig_dir, "pca_scree.png"), width = 1200, height = 820, res = 140)
par(mar = c(5, 5, 4, 5))
barplot(pca_prop, col = "#9ECAE1", border = "#3182BD",
        names.arg = 1:length(pca_prop),
        xlab = "主成分序号", ylab = "方差贡献率",
        main = "PCA 碎石图与累计贡献率")
lines(seq_along(pca_cum), pca_cum, type = "b", pch = 19, col = "#E6550D")
axis(4, at = seq(0, 1, 0.2), labels = seq(0, 1, 0.2))
mtext("累计贡献率", side = 4, line = 3)
abline(h = 0.8, lty = 2, col = "gray40")
dev.off()

png(file.path(fig_dir, "pca_variable_pc12.png"), width = 1000, height = 1000, res = 140)
plot(cos(seq(0, 2*pi, length.out = 361)), sin(seq(0, 2*pi, length.out = 361)),
     type = "l", asp = 1, xlab = "PC1", ylab = "PC2",
     main = "变量在前两个主成分平面上的投影",
     col = "#3182BD", xlim = c(-1, 1), ylim = c(-1, 1))
abline(h = 0, v = 0, lty = 2, col = "gray60")
points(pca_cor12[, 1], pca_cor12[, 2], pch = 19, col = "#E6550D")
text(pca_cor12[, 1], pca_cor12[, 2], labels = rownames(pca_cor12), pos = 3, cex = 0.8)
dev.off()

sample_plot <- split_stratified(Y, prop = min(1, 5000 / length(Y)), seed = 20260607)
png(file.path(fig_dir, "pca_observation_pc12.png"), width = 1200, height = 900, res = 140)
plot(pca$x[sample_plot, 1], pca$x[sample_plot, 2],
     col = ifelse(Y[sample_plot] == 1, "#D73027AA", "#4575B4AA"),
     pch = 16, xlab = "PC1 score", ylab = "PC2 score",
     main = "观测在前两个主成分平面上的散点图")
legend("topright", legend = c("未违约", "违约"),
       col = c("#4575B4", "#D73027"), pch = 16, bty = "n")
abline(h = 0, v = 0, lty = 2, col = "gray70")
dev.off()

## ---------- Factor analysis ----------
fa_k <- k_eig1
fa_fit <- factanal(X_scaled, factors = fa_k, rotation = "varimax", scores = "regression")
fa_load <- as.matrix(fa_fit$loadings)
fa_comm <- rowSums(fa_load^2)
fa_var <- colSums(fa_load^2) / ncol(X)
fa_table <- data.frame(
  Variable = rownames(fa_load),
  round(fa_load[, 1:min(6, ncol(fa_load)), drop = FALSE], 3),
  Communality = round(fa_comm, 3),
  Uniqueness = round(fa_fit$uniquenesses, 3)
)
write_table("Factor analysis loadings", fa_table)
cat("Factor variance proportions:\n")
print(round(fa_var, 4))
cat("Factor cumulative variance:", round(cumsum(fa_var), 4), "\n")

fa_interpret <- lapply(1:ncol(fa_load), function(j) {
  v <- names(sort(abs(fa_load[, j]), decreasing = TRUE))[1:5]
  paste(v, collapse = ", ")
})
names(fa_interpret) <- paste0("Factor", 1:ncol(fa_load))
write_table("Factor top variables", as.data.frame(fa_interpret))

png(file.path(fig_dir, "fa_variable_factor12.png"), width = 1000, height = 900, res = 140)
plot(fa_load[, 1], fa_load[, 2], type = "n",
     xlab = "Factor 1 loading", ylab = "Factor 2 loading",
     main = "变量在前两个公共因子平面上的投影",
     xlim = c(-1, 1), ylim = c(-1, 1), asp = 1)
abline(h = 0, v = 0, lty = 2, col = "gray60")
points(fa_load[, 1], fa_load[, 2], pch = 19, col = "#756BB1")
text(fa_load[, 1], fa_load[, 2], rownames(fa_load), pos = 3, cex = 0.8)
dev.off()

fa_scores <- fa_fit$scores
png(file.path(fig_dir, "fa_score_factor12.png"), width = 1200, height = 900, res = 140)
plot(fa_scores[sample_plot, 1], fa_scores[sample_plot, 2],
     col = ifelse(Y[sample_plot] == 1, "#D73027AA", "#4575B4AA"),
     pch = 16, xlab = "Factor 1 score", ylab = "Factor 2 score",
     main = "观测在前两个公共因子得分平面上的散点图")
legend("topright", legend = c("未违约", "违约"),
       col = c("#4575B4", "#D73027"), pch = 16, bty = "n")
abline(h = 0, v = 0, lty = 2, col = "gray70")
dev.off()

## ---------- Clustering ----------
cluster_idx <- split_stratified(Y, prop = min(1, 3000 / length(Y)), seed = 20260607)
X_cluster <- X_scaled[cluster_idx, ]
Y_cluster <- Y[cluster_idx]
d_cluster <- dist(X_cluster)
h_methods <- c("single", "average", "complete", "ward.D2")
h_results <- list()
for (m in h_methods) {
  hc <- hclust(d_cluster, method = m)
  cl <- cutree(hc, k = 2)
  bm <- best_cluster_metrics(Y_cluster, cl)
  h_results[[m]] <- list(hc = hc, cl = cl, best = bm)
}
h_metric_table <- do.call(rbind, lapply(names(h_results), function(m) {
  cbind(Method = m, round_df(h_results[[m]]$best$metrics, 4))
}))
write_table("Hierarchical clustering metrics", h_metric_table)

best_h <- names(h_results)[which.max(sapply(h_results, function(z) z$best$metrics$Accuracy))]
png(file.path(fig_dir, "hclust_dendrogram.png"), width = 1200, height = 850, res = 140)
plot(h_results[[best_h]]$hc, labels = FALSE, hang = -1,
     main = paste("系统聚类树状图：", best_h),
     xlab = "样本（分层抽样）", ylab = "距离")
rect.hclust(h_results[[best_h]]$hc, k = 2, border = c("#4575B4", "#D73027"))
dev.off()

k_algorithms <- c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen")
k_results <- list()
for (alg in k_algorithms) {
  set.seed(20260607)
  km <- kmeans(X_scaled, centers = 2, nstart = 20, algorithm = alg, iter.max = 100)
  bm <- best_cluster_metrics(Y, km$cluster)
  k_results[[alg]] <- list(km = km, best = bm)
}
k_metric_table <- do.call(rbind, lapply(names(k_results), function(m) {
  cbind(Algorithm = m, round_df(k_results[[m]]$best$metrics, 4),
        WithinSS = round(k_results[[m]]$km$tot.withinss, 2))
}))
write_table("Kmeans clustering metrics", k_metric_table)

best_k <- names(k_results)[which.max(sapply(k_results, function(z) z$best$metrics$Accuracy))]
png(file.path(fig_dir, "kmeans_cluster_pc12.png"), width = 1200, height = 900, res = 140)
plot(pca$x[sample_plot, 1], pca$x[sample_plot, 2],
     col = ifelse(k_results[[best_k]]$best$pred[sample_plot] == 1, "#D73027AA", "#4575B4AA"),
     pch = 16, xlab = "PC1 score", ylab = "PC2 score",
     main = paste("动态聚类结果在 PC1-PC2 平面上的可视化：", best_k))
legend("topright", legend = c("聚类0", "聚类1"),
       col = c("#4575B4", "#D73027"), pch = 16, bty = "n")
abline(h = 0, v = 0, lty = 2, col = "gray70")
dev.off()

## Spectral clustering; use kernlab::specc if available, otherwise manual RBF spectral clustering.
spec_idx <- split_stratified(Y, prop = min(1, 1200 / length(Y)), seed = 20260607)
X_spec <- X_scaled[spec_idx, ]
Y_spec <- Y[spec_idx]
if (requireNamespace("kernlab", quietly = TRUE)) {
  spec_obj <- kernlab::specc(X_spec, centers = 2)
  spec_cl <- as.integer(spec_obj)
  spec_note <- "kernlab::specc"
} else {
  D <- as.matrix(dist(X_spec))
  sigma <- median(D[D > 0])
  W <- exp(-D^2 / (2 * sigma^2))
  diag(W) <- 0
  deg <- rowSums(W)
  Lsym <- diag(length(deg)) - diag(1 / sqrt(deg)) %*% W %*% diag(1 / sqrt(deg))
  eg <- eigen(Lsym, symmetric = TRUE)
  U <- eg$vectors[, (ncol(eg$vectors) - 1):ncol(eg$vectors)]
  U <- U / sqrt(rowSums(U^2))
  set.seed(20260607)
  spec_cl <- kmeans(U, centers = 2, nstart = 20)$cluster
  spec_note <- "manual normalized spectral clustering"
}
spec_best <- best_cluster_metrics(Y_spec, spec_cl)
write_table("Spectral clustering metrics", cbind(Method = spec_note, round_df(spec_best$metrics, 4)))

png(file.path(fig_dir, "spectral_cluster_pc12.png"), width = 1200, height = 900, res = 140)
plot(pca$x[spec_idx, 1], pca$x[spec_idx, 2],
     col = ifelse(spec_best$pred == 1, "#D73027AA", "#4575B4AA"),
     pch = 16, xlab = "PC1 score", ylab = "PC2 score",
     main = "谱聚类结果在 PC1-PC2 平面上的可视化")
legend("topright", legend = c("聚类0", "聚类1"),
       col = c("#4575B4", "#D73027"), pch = 16, bty = "n")
abline(h = 0, v = 0, lty = 2, col = "gray70")
dev.off()

## ---------- Discriminant analysis ----------
train70 <- split_stratified(Y, 0.7, 20260607)
disc_train <- data[train70, c(paste0("X", 1:23), "Y")]
disc_test <- data[-train70, c(paste0("X", 1:23), "Y")]
disc_train$Y <- factor(disc_train$Y, levels = c(0, 1))
disc_test$Y <- factor(disc_test$Y, levels = c(0, 1))

lda_fit <- MASS::lda(Y ~ ., data = disc_train)
lda_train_pred <- predict(lda_fit, disc_train)$class
lda_test_pred <- predict(lda_fit, disc_test)$class
lda_train_cm <- table(Actual = disc_train$Y, Predicted = lda_train_pred)
lda_test_cm <- table(Actual = disc_test$Y, Predicted = lda_test_pred)
lda_train_metrics <- confusion_metrics(as.integer(as.character(disc_train$Y)), as.integer(as.character(lda_train_pred)))
lda_test_metrics <- confusion_metrics(as.integer(as.character(disc_test$Y)), as.integer(as.character(lda_test_pred)))
write_table("LDA train confusion", lda_train_cm)
write_table("LDA train metrics", round_df(lda_train_metrics, 4))
write_table("LDA test confusion", lda_test_cm)
write_table("LDA test metrics", round_df(lda_test_metrics, 4))

lda_score <- predict(lda_fit, disc_train)$x[, 1]
png(file.path(fig_dir, "lda_projection.png"), width = 1200, height = 820, res = 140)
hist(lda_score[disc_train$Y == "0"], breaks = 60, freq = FALSE,
     col = "#4575B480", border = "#4575B4",
     xlab = "LD1", main = "线性判别分析：训练集 LD1 分布")
hist(lda_score[disc_train$Y == "1"], breaks = 60, freq = FALSE,
     col = "#D7302780", border = "#D73027", add = TRUE)
legend("topright", legend = c("未违约", "违约"),
       fill = c("#4575B480", "#D7302780"), bty = "n")
dev.off()

qda_ok <- TRUE
qda_msg <- ""
qda_fit <- tryCatch(MASS::qda(Y ~ ., data = disc_train), error = function(e) {
  qda_ok <<- FALSE
  qda_msg <<- conditionMessage(e)
  NULL
})
if (qda_ok) {
  qda_train_pred <- predict(qda_fit, disc_train)$class
  qda_test_pred <- predict(qda_fit, disc_test)$class
  qda_train_cm <- table(Actual = disc_train$Y, Predicted = qda_train_pred)
  qda_test_cm <- table(Actual = disc_test$Y, Predicted = qda_test_pred)
  qda_train_metrics <- confusion_metrics(as.integer(as.character(disc_train$Y)), as.integer(as.character(qda_train_pred)))
  qda_test_metrics <- confusion_metrics(as.integer(as.character(disc_test$Y)), as.integer(as.character(qda_test_pred)))
  write_table("QDA train confusion", qda_train_cm)
  write_table("QDA train metrics", round_df(qda_train_metrics, 4))
  write_table("QDA test confusion", qda_test_cm)
  write_table("QDA test metrics", round_df(qda_test_metrics, 4))
} else {
  cat("QDA failed:", qda_msg, "\n")
}

sink()
close(out)

cat("Analysis finished. Results: mvsa_numeric_results.txt; figures:", fig_dir, "\n")