rm(list = ls(all = TRUE)) graphics.off() set.seed(20260607) options(digits = 4) data_file <- "Default of Credit Card Clients.csv" fig_dir <- "mvsa_figures" dir.create(fig_dir, showWarnings = FALSE) ## ---------- utility functions ---------- split_stratified <- function(y, prop = 0.8, seed = 1) { set.seed(seed) idx <- unlist(lapply(split(seq_along(y), y), function(ind) { sample(ind, floor(length(ind) * prop)) })) sort(idx) } confusion_metrics <- function(actual, pred, positive = 1) { actual <- as.integer(actual) pred <- as.integer(pred) tab <- table( Actual = factor(actual, levels = c(0, 1)), Predicted = factor(pred, levels = c(0, 1)) ) acc <- sum(diag(tab)) / sum(tab) sens <- tab["1", "1"] / sum(tab["1", ]) spec <- tab["0", "0"] / sum(tab["0", ]) precision <- ifelse(sum(tab[, "1"]) == 0, NA, tab["1", "1"] / sum(tab[, "1"])) data.frame( Accuracy = acc, Sensitivity = sens, Specificity = spec, Precision = precision, ErrorRate = 1 - acc ) } best_cluster_metrics <- function(actual, cluster) { cluster <- as.integer(as.factor(cluster)) pred0 <- cluster - 1 m0 <- confusion_metrics(actual, pred0) m1 <- confusion_metrics(actual, 1 - pred0) if (m0$Accuracy >= m1$Accuracy) { list(pred = pred0, metrics = m0, table = table(Actual = actual, Cluster = pred0)) } else { list(pred = 1 - pred0, metrics = m1, table = table(Actual = actual, Cluster = 1 - pred0)) } } auc_rank <- function(y, score) { y <- as.integer(y) r <- rank(score, ties.method = "average") n1 <- sum(y == 1) n0 <- sum(y == 0) (sum(r[y == 1]) - n1 * (n1 + 1) / 2) / (n1 * n0) } round_df <- function(x, digits = 4) { x <- as.data.frame(x) num <- vapply(x, is.numeric, logical(1)) x[num] <- lapply(x[num], round, digits) x } write_table <- function(name, x) { cat("\n====", name, "====\n") print(x) } ## ---------- read and preprocess data ---------- data <- read.csv(data_file, check.names = FALSE) X <- data[, paste0("X", 1:23)] Y <- data$Y X_scaled <- scale(X) y_factor <- factor(Y, levels = c(0, 1), labels = c("NoDefault", "Default")) out <- file("mvsa_numeric_results.txt", open = "wt", encoding = "UTF-8") sink(out) cat("Data dimension:", paste(dim(data), collapse = " x "), "\n") cat("Missing values:", sum(is.na(data)), "\n") cat("Y table:\n") print(table(Y)) cat("Y proportion:\n") print(round(prop.table(table(Y)), 4)) desc_selected <- data.frame( Variable = names(X), Mean = sapply(X, mean), SD = sapply(X, sd), Min = sapply(X, min), Median = sapply(X, median), Max = sapply(X, max) ) write_table("Descriptive statistics", round_df(desc_selected, 2)) ## ---------- descriptive plots ---------- png(file.path(fig_dir, "desc_y_bar.png"), width = 1100, height = 760, res = 140) barplot(table(Y), col = c("#80B1D3", "#FB8072"), names.arg = c("未违约(0)", "违约(1)"), main = "响应变量 Y 的类别分布", ylab = "人数") text(x = c(0.7, 1.9), y = as.numeric(table(Y)) + 500, labels = paste0(as.numeric(table(Y)), "人"), cex = 1.1) dev.off() png(file.path(fig_dir, "desc_boxplot_keyvars.png"), width = 1200, height = 820, res = 140) par(mfrow = c(2, 2), mar = c(4, 4, 3, 1)) boxplot(X1 ~ Y, data = data, col = c("#B3DE69", "#FDB462"), names = c("未违约", "违约"), main = "授信额度 X1", ylab = "新台币") boxplot(X5 ~ Y, data = data, col = c("#B3DE69", "#FDB462"), names = c("未违约", "违约"), main = "年龄 X5", ylab = "岁") boxplot(X6 ~ Y, data = data, col = c("#B3DE69", "#FDB462"), names = c("未违约", "违约"), main = "9月还款状态 X6", ylab = "状态") boxplot(X18 ~ Y, data = data, col = c("#B3DE69", "#FDB462"), names = c("未违约", "违约"), main = "9月还款金额 X18", ylab = "新台币") dev.off() ## ---------- Logistic regression ---------- train80 <- split_stratified(Y, 0.8, 20260607) train_log <- data[train80, c(paste0("X", 1:23), "Y")] test_log <- data[-train80, c(paste0("X", 1:23), "Y")] log_fit <- glm(Y ~ ., data = train_log, family = binomial(), control = glm.control(maxit = 100)) log_prob <- predict(log_fit, newdata = test_log, type = "response") log_pred <- ifelse(log_prob >= 0.5, 1, 0) log_cm <- table( Actual = factor(test_log$Y, levels = c(0, 1)), Predicted = factor(log_pred, levels = c(0, 1)) ) log_metrics <- confusion_metrics(test_log$Y, log_pred) log_auc <- auc_rank(test_log$Y, log_prob) coef_tab <- summary(log_fit)$coefficients coef_key <- coef_tab[-1, , drop = FALSE] top_coef <- coef_key[order(abs(coef_key[, "z value"]), decreasing = TRUE)[1:8], ] write_table("Logistic confusion matrix", log_cm) write_table("Logistic metrics", round_df(log_metrics, 4)) cat("Logistic AUC:", round(log_auc, 4), "\n") write_table("Top logistic coefficients by |z|", round_df(top_coef, 4)) ## ---------- PCA ---------- pca <- prcomp(X_scaled, center = FALSE, scale. = FALSE) eig <- pca$sdev^2 pca_prop <- eig / sum(eig) pca_cum <- cumsum(pca_prop) k_eig1 <- sum(eig > 1) k_80 <- which(pca_cum >= 0.80)[1] pca_table <- data.frame( PC = paste0("PC", 1:10), Eigenvalue = eig[1:10], Proportion = pca_prop[1:10], Cumulative = pca_cum[1:10] ) write_table("PCA variance table", round_df(pca_table, 4)) cat("PCA k eigenvalue>1:", k_eig1, "\n") cat("PCA k cumulative>=80%:", k_80, "\n") pca_cor12 <- sweep(pca$rotation[, 1:2], 2, pca$sdev[1:2], "*") colnames(pca_cor12) <- c("PC1", "PC2") pca_top <- data.frame( PC1_positive = names(sort(pca_cor12[, 1], decreasing = TRUE))[1:5], PC1_negative = names(sort(pca_cor12[, 1], decreasing = FALSE))[1:5], PC2_positive = names(sort(pca_cor12[, 2], decreasing = TRUE))[1:5], PC2_negative = names(sort(pca_cor12[, 2], decreasing = FALSE))[1:5] ) write_table("PCA top variables", pca_top) png(file.path(fig_dir, "pca_scree.png"), width = 1200, height = 820, res = 140) par(mar = c(5, 5, 4, 5)) barplot(pca_prop, col = "#9ECAE1", border = "#3182BD", names.arg = 1:length(pca_prop), xlab = "主成分序号", ylab = "方差贡献率", main = "PCA 碎石图与累计贡献率") lines(seq_along(pca_cum), pca_cum, type = "b", pch = 19, col = "#E6550D") axis(4, at = seq(0, 1, 0.2), labels = seq(0, 1, 0.2)) mtext("累计贡献率", side = 4, line = 3) abline(h = 0.8, lty = 2, col = "gray40") dev.off() png(file.path(fig_dir, "pca_variable_pc12.png"), width = 1000, height = 1000, res = 140) plot(cos(seq(0, 2*pi, length.out = 361)), sin(seq(0, 2*pi, length.out = 361)), type = "l", asp = 1, xlab = "PC1", ylab = "PC2", main = "变量在前两个主成分平面上的投影", col = "#3182BD", xlim = c(-1, 1), ylim = c(-1, 1)) abline(h = 0, v = 0, lty = 2, col = "gray60") points(pca_cor12[, 1], pca_cor12[, 2], pch = 19, col = "#E6550D") text(pca_cor12[, 1], pca_cor12[, 2], labels = rownames(pca_cor12), pos = 3, cex = 0.8) dev.off() sample_plot <- split_stratified(Y, prop = min(1, 5000 / length(Y)), seed = 20260607) png(file.path(fig_dir, "pca_observation_pc12.png"), width = 1200, height = 900, res = 140) plot(pca$x[sample_plot, 1], pca$x[sample_plot, 2], col = ifelse(Y[sample_plot] == 1, "#D73027AA", "#4575B4AA"), pch = 16, xlab = "PC1 score", ylab = "PC2 score", main = "观测在前两个主成分平面上的散点图") legend("topright", legend = c("未违约", "违约"), col = c("#4575B4", "#D73027"), pch = 16, bty = "n") abline(h = 0, v = 0, lty = 2, col = "gray70") dev.off() ## ---------- Factor analysis ---------- fa_k <- k_eig1 fa_fit <- factanal(X_scaled, factors = fa_k, rotation = "varimax", scores = "regression") fa_load <- as.matrix(fa_fit$loadings) fa_comm <- rowSums(fa_load^2) fa_var <- colSums(fa_load^2) / ncol(X) fa_table <- data.frame( Variable = rownames(fa_load), round(fa_load[, 1:min(6, ncol(fa_load)), drop = FALSE], 3), Communality = round(fa_comm, 3), Uniqueness = round(fa_fit$uniquenesses, 3) ) write_table("Factor analysis loadings", fa_table) cat("Factor variance proportions:\n") print(round(fa_var, 4)) cat("Factor cumulative variance:", round(cumsum(fa_var), 4), "\n") fa_interpret <- lapply(1:ncol(fa_load), function(j) { v <- names(sort(abs(fa_load[, j]), decreasing = TRUE))[1:5] paste(v, collapse = ", ") }) names(fa_interpret) <- paste0("Factor", 1:ncol(fa_load)) write_table("Factor top variables", as.data.frame(fa_interpret)) png(file.path(fig_dir, "fa_variable_factor12.png"), width = 1000, height = 900, res = 140) plot(fa_load[, 1], fa_load[, 2], type = "n", xlab = "Factor 1 loading", ylab = "Factor 2 loading", main = "变量在前两个公共因子平面上的投影", xlim = c(-1, 1), ylim = c(-1, 1), asp = 1) abline(h = 0, v = 0, lty = 2, col = "gray60") points(fa_load[, 1], fa_load[, 2], pch = 19, col = "#756BB1") text(fa_load[, 1], fa_load[, 2], rownames(fa_load), pos = 3, cex = 0.8) dev.off() fa_scores <- fa_fit$scores png(file.path(fig_dir, "fa_score_factor12.png"), width = 1200, height = 900, res = 140) plot(fa_scores[sample_plot, 1], fa_scores[sample_plot, 2], col = ifelse(Y[sample_plot] == 1, "#D73027AA", "#4575B4AA"), pch = 16, xlab = "Factor 1 score", ylab = "Factor 2 score", main = "观测在前两个公共因子得分平面上的散点图") legend("topright", legend = c("未违约", "违约"), col = c("#4575B4", "#D73027"), pch = 16, bty = "n") abline(h = 0, v = 0, lty = 2, col = "gray70") dev.off() ## ---------- Clustering ---------- cluster_idx <- split_stratified(Y, prop = min(1, 3000 / length(Y)), seed = 20260607) X_cluster <- X_scaled[cluster_idx, ] Y_cluster <- Y[cluster_idx] d_cluster <- dist(X_cluster) h_methods <- c("single", "average", "complete", "ward.D2") h_results <- list() for (m in h_methods) { hc <- hclust(d_cluster, method = m) cl <- cutree(hc, k = 2) bm <- best_cluster_metrics(Y_cluster, cl) h_results[[m]] <- list(hc = hc, cl = cl, best = bm) } h_metric_table <- do.call(rbind, lapply(names(h_results), function(m) { cbind(Method = m, round_df(h_results[[m]]$best$metrics, 4)) })) write_table("Hierarchical clustering metrics", h_metric_table) best_h <- names(h_results)[which.max(sapply(h_results, function(z) z$best$metrics$Accuracy))] png(file.path(fig_dir, "hclust_dendrogram.png"), width = 1200, height = 850, res = 140) plot(h_results[[best_h]]$hc, labels = FALSE, hang = -1, main = paste("系统聚类树状图:", best_h), xlab = "样本(分层抽样)", ylab = "距离") rect.hclust(h_results[[best_h]]$hc, k = 2, border = c("#4575B4", "#D73027")) dev.off() k_algorithms <- c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen") k_results <- list() for (alg in k_algorithms) { set.seed(20260607) km <- kmeans(X_scaled, centers = 2, nstart = 20, algorithm = alg, iter.max = 100) bm <- best_cluster_metrics(Y, km$cluster) k_results[[alg]] <- list(km = km, best = bm) } k_metric_table <- do.call(rbind, lapply(names(k_results), function(m) { cbind(Algorithm = m, round_df(k_results[[m]]$best$metrics, 4), WithinSS = round(k_results[[m]]$km$tot.withinss, 2)) })) write_table("Kmeans clustering metrics", k_metric_table) best_k <- names(k_results)[which.max(sapply(k_results, function(z) z$best$metrics$Accuracy))] png(file.path(fig_dir, "kmeans_cluster_pc12.png"), width = 1200, height = 900, res = 140) plot(pca$x[sample_plot, 1], pca$x[sample_plot, 2], col = ifelse(k_results[[best_k]]$best$pred[sample_plot] == 1, "#D73027AA", "#4575B4AA"), pch = 16, xlab = "PC1 score", ylab = "PC2 score", main = paste("动态聚类结果在 PC1-PC2 平面上的可视化:", best_k)) legend("topright", legend = c("聚类0", "聚类1"), col = c("#4575B4", "#D73027"), pch = 16, bty = "n") abline(h = 0, v = 0, lty = 2, col = "gray70") dev.off() ## Spectral clustering; use kernlab::specc if available, otherwise manual RBF spectral clustering. spec_idx <- split_stratified(Y, prop = min(1, 1200 / length(Y)), seed = 20260607) X_spec <- X_scaled[spec_idx, ] Y_spec <- Y[spec_idx] if (requireNamespace("kernlab", quietly = TRUE)) { spec_obj <- kernlab::specc(X_spec, centers = 2) spec_cl <- as.integer(spec_obj) spec_note <- "kernlab::specc" } else { D <- as.matrix(dist(X_spec)) sigma <- median(D[D > 0]) W <- exp(-D^2 / (2 * sigma^2)) diag(W) <- 0 deg <- rowSums(W) Lsym <- diag(length(deg)) - diag(1 / sqrt(deg)) %*% W %*% diag(1 / sqrt(deg)) eg <- eigen(Lsym, symmetric = TRUE) U <- eg$vectors[, (ncol(eg$vectors) - 1):ncol(eg$vectors)] U <- U / sqrt(rowSums(U^2)) set.seed(20260607) spec_cl <- kmeans(U, centers = 2, nstart = 20)$cluster spec_note <- "manual normalized spectral clustering" } spec_best <- best_cluster_metrics(Y_spec, spec_cl) write_table("Spectral clustering metrics", cbind(Method = spec_note, round_df(spec_best$metrics, 4))) png(file.path(fig_dir, "spectral_cluster_pc12.png"), width = 1200, height = 900, res = 140) plot(pca$x[spec_idx, 1], pca$x[spec_idx, 2], col = ifelse(spec_best$pred == 1, "#D73027AA", "#4575B4AA"), pch = 16, xlab = "PC1 score", ylab = "PC2 score", main = "谱聚类结果在 PC1-PC2 平面上的可视化") legend("topright", legend = c("聚类0", "聚类1"), col = c("#4575B4", "#D73027"), pch = 16, bty = "n") abline(h = 0, v = 0, lty = 2, col = "gray70") dev.off() ## ---------- Discriminant analysis ---------- train70 <- split_stratified(Y, 0.7, 20260607) disc_train <- data[train70, c(paste0("X", 1:23), "Y")] disc_test <- data[-train70, c(paste0("X", 1:23), "Y")] disc_train$Y <- factor(disc_train$Y, levels = c(0, 1)) disc_test$Y <- factor(disc_test$Y, levels = c(0, 1)) lda_fit <- MASS::lda(Y ~ ., data = disc_train) lda_train_pred <- predict(lda_fit, disc_train)$class lda_test_pred <- predict(lda_fit, disc_test)$class lda_train_cm <- table(Actual = disc_train$Y, Predicted = lda_train_pred) lda_test_cm <- table(Actual = disc_test$Y, Predicted = lda_test_pred) lda_train_metrics <- confusion_metrics(as.integer(as.character(disc_train$Y)), as.integer(as.character(lda_train_pred))) lda_test_metrics <- confusion_metrics(as.integer(as.character(disc_test$Y)), as.integer(as.character(lda_test_pred))) write_table("LDA train confusion", lda_train_cm) write_table("LDA train metrics", round_df(lda_train_metrics, 4)) write_table("LDA test confusion", lda_test_cm) write_table("LDA test metrics", round_df(lda_test_metrics, 4)) lda_score <- predict(lda_fit, disc_train)$x[, 1] png(file.path(fig_dir, "lda_projection.png"), width = 1200, height = 820, res = 140) hist(lda_score[disc_train$Y == "0"], breaks = 60, freq = FALSE, col = "#4575B480", border = "#4575B4", xlab = "LD1", main = "线性判别分析:训练集 LD1 分布") hist(lda_score[disc_train$Y == "1"], breaks = 60, freq = FALSE, col = "#D7302780", border = "#D73027", add = TRUE) legend("topright", legend = c("未违约", "违约"), fill = c("#4575B480", "#D7302780"), bty = "n") dev.off() qda_ok <- TRUE qda_msg <- "" qda_fit <- tryCatch(MASS::qda(Y ~ ., data = disc_train), error = function(e) { qda_ok <<- FALSE qda_msg <<- conditionMessage(e) NULL }) if (qda_ok) { qda_train_pred <- predict(qda_fit, disc_train)$class qda_test_pred <- predict(qda_fit, disc_test)$class qda_train_cm <- table(Actual = disc_train$Y, Predicted = qda_train_pred) qda_test_cm <- table(Actual = disc_test$Y, Predicted = qda_test_pred) qda_train_metrics <- confusion_metrics(as.integer(as.character(disc_train$Y)), as.integer(as.character(qda_train_pred))) qda_test_metrics <- confusion_metrics(as.integer(as.character(disc_test$Y)), as.integer(as.character(qda_test_pred))) write_table("QDA train confusion", qda_train_cm) write_table("QDA train metrics", round_df(qda_train_metrics, 4)) write_table("QDA test confusion", qda_test_cm) write_table("QDA test metrics", round_df(qda_test_metrics, 4)) } else { cat("QDA failed:", qda_msg, "\n") } sink() close(out) cat("Analysis finished. Results: mvsa_numeric_results.txt; figures:", fig_dir, "\n")