# ------------------------------- Boxplot ------------------------------- rm(list = ls(all = TRUE)) graphics.off() library(mclust) data(banknote) str(banknote) head(banknote) x = banknote$Length summary(x) quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1)) boxplot(x) library(ggplot2) ggplot(data = banknote, aes(x = Status, y = Length)) + geom_boxplot(notch = TRUE) ggplot(data = banknote, aes(x = Status, y = Diagonal)) + geom_boxplot(notch = FALSE) # ------------------------------- Histogram ------------------------------- par(mfrow=c(2, 2)) x_0 = 137.8 h1 = 0.1 breaks.vec1 <- seq(from = x_0, by = h1, length.out = 30) hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec1, xlab = 'h = 0.1', ylab = '', main = '') h2 = 0.2 breaks.vec2 <- seq(from = x_0, by = h2, length.out = 16) hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec2, xlab = 'h = 0.2', ylab = '', main = '') h3 = 0.3 breaks.vec3 <- seq(from = x_0, by = h3, length.out = 11) hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec3, xlab = 'h = 0.3', ylab = '', main = '') h4 = 0.4 breaks.vec4 <- seq(from = x_0, by = h4, length.out = 8) hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec4, xlab = 'h = 0.4', ylab = '', main = '') par(mfrow=c(2, 2)) x_0 = 137.45 breaks.vec5 <- seq(from = x_0, by = h4, length.out = 9) hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec5, xlab = 'x0 = 137.45', ylab = '', main = '') x_0 = 137.55 breaks.vec6 <- seq(from = x_0, by = h4, length.out = 9) hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec6, xlab = 'x0 = 137.55', ylab = '', main = '') x_0 = 137.65 breaks.vec7 <- seq(from = x_0, by = h4, length.out = 9) hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec7, xlab = 'x0 = 137.65', ylab = '', main = '') x_0 = 137.75 breaks.vec8 <- seq(from = x_0, by = h4, length.out = 9) hist(banknote$Diagonal[banknote$Status == 'counterfeit'], breaks = breaks.vec8, xlab = 'x0 = 137.75', ylab = '', main = '') # ------------------------------- Kernel Densities ------------------------------- par(mfrow=c(2, 2)) s = function(x) (3/4) * (1-x^2) h = 1/2 g = function(x) (1/h) * s(x/h) curve(s, -1, 1, xlim=c(-1.5, 1.5), ylim=c(0, 5), xlab="", ylab="Epanechnikov Kernel", col=2) lines(c(-1.5, -1), c(0, 0), col=2) lines(c(1, 1.5), c(0, 0), col=2) abline(v=0, lty=2) curve(g, -h, h, col=4, add=TRUE) lines(c(-1, -0.5), c(0, 0), col=4) lines(c(0.5, 1), c(0, 0), col=4) lines(c(-0.5, 0.5), c(0, 0), lty=2) x = c(-1.5, -0.8, -0.2, 0, 0.2) f = function(t) (3/4) * (1-t^2) h = 1/2 g1 = function(t) (1/h) * f((t-x[1])/h) g2 = function(t) (1/h) * f((t-x[2])/h) g3 = function(t) (1/h) * f((t-x[3])/h) g4 = function(t) (1/h) * f((t-x[4])/h) g5 = function(t) (1/h) * f((t-x[5])/h) curve(g1, x[1]-h, x[1]+h, xlim=c(-2, 1), ylim=c(0, 5), xlab="", ylab="", lty=1) curve(g2, x[2]-h, x[2]+h, add=TRUE, col=2, lty=1) curve(g3, x[3]-h, x[3]+h, add=TRUE, col=3, lty=1) curve(g4, x[4]-h, x[4]+h, add=TRUE, col=4, lty=1) curve(g5, x[5]-h, x[5]+h, add=TRUE, col=5, lty=1) abline(h=0, lty=2) for (i in 1:5) lines(c(x[i], x[i]), c(0, 1.5), lty=2, col=i) for (i in 1:5) points(x[i], 0, cex=0.8, pch=16, col=i) h1 = function(t) g1(t) curve(h1, -2, -1.3, xlim=c(-2, 1), ylim=c(0, 5), xlab="", ylab="") h2 = function(t) (g1(t) + g2(t)) curve(h2, -1.3, -1, add=TRUE) h3 = function(t) g2(t) curve(h3, -1, -0.7, add=TRUE) h4 = function(t) (g2(t) + g3(t)) curve(h4, -0.7, -0.5, add=TRUE) h5 = function(t) (g2(t) + g3(t) + g4(t)) curve(h5, -0.5, -0.3, add=TRUE) h6 = function(t) (g3(t) + g4(t) + g5(t)) curve(h6, -0.3, 0.3, add=TRUE) h7 = function(t) (g4(t) + g5(t)) curve(h7, 0.3, 0.5, add=TRUE) h8 = function(t) g5(t) curve(h8, 0.5, 0.7, add=TRUE) abline(h=0, lty=2) f1 = function(t) (1/5) * g1(t) curve(f1, -2, -1.3, xlim=c(-2, 1), ylim=c(0, 5), xlab="", ylab="") f2 = function(t) (1/5) * (g1(t) + g2(t)) curve(f2, -1.3, -1, add=TRUE) f3 = function(t) (1/5) * g2(t) curve(f3, -1, -0.7, add=TRUE) f4 = function(t) (1/5) * (g2(t) + g3(t)) curve(f4, -0.7, -0.5, add=TRUE) f5 = function(t) (1/5) * (g2(t) + g3(t) + g4(t)) curve(f5, -0.5, -0.3, add=TRUE) f6 = function(t) (1/5) * (g3(t) + g4(t) + g5(t)) curve(f6, -0.3, 0.3, add=TRUE) f7 = function(t) (1/5) * (g4(t) + g5(t)) curve(f7, 0.3, 0.5, add=TRUE) f8 = function(t) (1/5) * g5(t) curve(f8, 0.5, 0.7, add=TRUE) abline(h=0, lty=2) par(mfrow=c(1, 1)) f1 = function(t) (1/5) * g1(t) curve(f1, -2, -1.3, xlim=c(-2, 1), ylim=c(0, 1), xlab="", ylab="") f2 = function(t) (1/5) * (g1(t) + g2(t)) curve(f2, -1.3, -1, add=TRUE) f3 = function(t) (1/5) * g2(t) curve(f3, -1, -0.7, add=TRUE) f4 = function(t) (1/5) * (g2(t) + g3(t)) curve(f4, -0.7, -0.5, add=TRUE) f5 = function(t) (1/5) * (g2(t) + g3(t) + g4(t)) curve(f5, -0.5, -0.3, add=TRUE) f6 = function(t) (1/5) * (g3(t) + g4(t) + g5(t)) curve(f6, -0.3, 0.3, add=TRUE) f7 = function(t) (1/5) * (g4(t) + g5(t)) curve(f7, 0.3, 0.5, add=TRUE) f8 = function(t) (1/5) * g5(t) curve(f8, 0.5, 0.7, add=TRUE) abline(h=0, lty=2) for (i in 1:5) points(x[i], 0, cex=1.5, pch=21, bg='orange') ## Example: Swiss banknotes par(mfrow=c(2, 2)) plot(density(banknote$Diagonal, bw = 0.04), xlab='Bandwidth = 0.04', main='Swiss Bank Notes', ylab='Diagonal') plot(density(banknote$Diagonal, bw = 0.15), xlab='Bandwidth = 0.15', main='Swiss Bank Notes', ylab='Diagonal') plot(density(banknote$Diagonal, bw = 0.4), xlab='Bandwidth = 0.4', main='Swiss Bank Notes', ylab='Diagonal') plot(density(banknote$Diagonal, bw = 1), xlab='Bandwidth = 1', main='Swiss Bank Notes', ylab='Diagonal') library(ggplot2) ggplot(banknote, aes(Diagonal, fill = Status, color = Status)) + geom_density(alpha = 0.2) + xlim(137, 143) ggplot(banknote, aes(x = Top, y = Diagonal)) + xlim(8, 13) + ylim(137, 143) + geom_density_2d(na.rm = TRUE) # ------------------------------- Scatter plots ------------------------------- ggplot(banknote, aes(x = Top, y = Diagonal, colour = Status, shape = Status)) + xlim(7, 13) + ylim(137, 143) + geom_point(size = 2.5) + geom_abline(intercept = 136.9, slope = 1/3, colour = 'purple',size = 1.5) library(plotly) plot_ly(banknote, x =~ Bottom, y =~ Top, z =~ Diagonal) %>% add_markers(color =~ Status, symbol =~ Status) library(GGally) ggpairs(banknote, columns = 2:7, ggplot2::aes(colour=Status, alpha = 0.2)) # ------------------------------- Chernoff Faces ------------------------------- library(TeachingDemos) faces(banknote[91:110, 2:7], fill = TRUE) faces(banknote[1:100, 2:7], ncol=10, fill = TRUE) faces(banknote[101:200, 2:7], ncol=10, fill = TRUE) # library(aplpack) # faces(banknote[91:110,], face.type = 1) # ------------------------------- Andrews’ Curves ------------------------------- library(pracma) A1 = as.matrix(banknote[96:105, 2:7]) andrewsplot(scale(A1), f = banknote$Status[96:105], style = 'cart') A2 = as.matrix(banknote[96:105, 7:2]) andrewsplot(scale(A2), f = banknote$Status[96:105], style = 'cart') andrewsplot(scale(A1), f = banknote$Status[96:105], style = 'pol') andrewsplot(scale(A2), f = banknote$Status[96:105], style = 'pol') A3 = as.matrix(banknote[1:200, 2:7]) andrewsplot(scale(A3), f = banknote$Status[1:200], style = 'cart') andrewsplot(scale(A3), f = banknote$Status[1:200], style = 'pol') A4 = as.matrix(banknote[1:200, 7:2]) andrewsplot(scale(A4), f = banknote$Status[1:200], style = 'cart') andrewsplot(scale(A4), f = banknote$Status[1:200], style = 'pol') # ------------------------------- Parallel Coordinate Plots ------------------------------- library(GGally) banknote.sub = banknote[96:105,] ggparcoord(data = banknote.sub, columns = 2:7, groupColumn = 1, scale = 'uniminmax') ggparcoord(data = banknote.sub, columns = 7:2, groupColumn = 1, scale = 'uniminmax') ggparcoord(data = banknote, columns = 2:7, groupColumn = 1, scale = 'uniminmax') ggparcoord(data = banknote, columns = 7:2, groupColumn = 1, scale = 'uniminmax') # ------------------------------- Hexagon Plots ------------------------------- library(ggplot2) ggplot(data = banknote, aes(x = Diagonal, y = Top, z = Length)) + stat_summary_hex() ?diamonds ggplot(data = diamonds, aes(x = depth, y = carat, z = price)) + stat_summary_hex() # ------------------------------- Boston Housing ------------------------------- library(MASS) str(Boston) head(Boston) # PCP (平行坐标图) library(GGally) c = Boston$medv > median(Boston$medv) # 按 X_14 是否大于其中位数分为两类 boston = cbind(Boston, c) ggparcoord(data = boston, columns = 1:14, groupColumn = 15, scale = 'uniminmax') # 全部14个变量的散点图矩阵 ggpairs(boston, columns = 1:14, ggplot2::aes(colour = c, alpha = 0.2)) # 变量 4, 6, 11, 12, 13, 14 的散点图矩阵 ggpairs(boston, columns = c(4, 6, 11, 12, 13, 14), ggplot2::aes(colour = c, alpha = 0.2)) # 变量 1:5, 14 的散点图矩阵 ggpairs(boston, columns = c(1:5, 14), ggplot2::aes(colour = c, alpha = 0.2)) library(magrittr) library(gridExtra) # ----------------- 变量 X_1 ----------------- # 变量 X_1 的箱线图 Box_X1 = ggplot(data = boston, aes(y = crim)) + geom_boxplot(notch = FALSE) # 变量 X_1 取对数之后的箱线图 Box_Log_X1 = ggplot(data = boston, aes(y = log(crim))) + geom_boxplot(notch = FALSE) grid.arrange(Box_X1, Box_Log_X1, ncol = 2) # 变量 X_1 依房价的分类箱线图 Boxc_X1 = ggplot(data = boston, aes(x = c, y = crim)) + geom_boxplot(notch = FALSE) # 变量 X_1 取对数之后的分类箱线图 Boxc_Log_X1 = ggplot(data = boston, aes(x = c, y = log(crim))) + geom_boxplot(notch = FALSE) grid.arrange(Boxc_X1, Boxc_Log_X1, ncol = 2) # 对数变换之后依房价的分类核密度图 ggplot(boston, aes(log(crim), fill = c, color = c)) + geom_density(alpha = 0.2) # 变量 1:5, 14 变换之后的散点图矩阵 Log_X1 = log(Boston$crim) X2 = Boston$zn / 10 Log_X3 = log(Boston$indus) Log_X5 = log(Boston$nox) Log_X14 = log(Boston$medv) boston_sub = data.frame(crim = Log_X1, zn = X2, indus = Log_X3, chas = Boston$chas, nox = Log_X5, medv = Log_X14, c = boston$c) ggpairs(boston_sub, columns = 1:6, ggplot2::aes(colour = c, alpha = 0.2)) # 变量 log(X_1) 与 log(X_14) 依房价分类的散点图 ggplot(boston_sub, aes(x = crim, y = medv, color = c)) + geom_point() + xlab("log(crim)") + ylab("log(medv)") # ----------------- 变量 X_2 ----------------- # 变量1:5, 14的散点图矩阵 ggpairs(boston, columns = c(1:5, 14), ggplot2::aes(colour = c, alpha = 0.2)) # 变量2的对数依变量14(房价)的对数的分类箱线图 ggplot(data = boston_sub, aes(x = c, y = zn)) + geom_boxplot(notch = FALSE) # ----------------- 变量 X_3 ----------------- # 变量3与变量14的平行坐标图 ggparcoord(data = boston, columns = 1:14, groupColumn = 15, scale = 'uniminmax') # 变量3与变量14的散点图 ggplot(boston, aes(x = indus, y = medv, color = c)) + geom_point() # 变量3与变量14经对数变换之后的散点图 ggplot(boston, aes(x = log(indus), y = log(medv), color = c)) + geom_point() # 变量3的核密度图 ggplot(boston, aes(indus)) + geom_density(linewidth = 1, colour = 'steelblue') # ----------------- 变量 X_4 ----------------- # Boston房屋数据集的PCP (平行坐标图) ggparcoord(data = boston, columns = 1:14, groupColumn = 15, scale = 'uniminmax') # 变量 1:5, 14 的散点图矩阵 ggpairs(boston, columns = c(1:5, 14), ggplot2::aes(colour = c, alpha = 0.2)) # 校正后的数据集 library(mlbench) data(BostonHousing2) head(BostonHousing2) subset(BostonHousing2, chas == 1) # ----------------- 变量 X_5 ----------------- # 变量5与变量14的散点图 ggplot(boston, aes(x = nox, y = medv, color = c)) + geom_point() # 变量5依房价的分类箱线图 ggplot(data = boston, aes(x = c, y = nox)) + geom_boxplot(notch = FALSE) # ----------------- 变量 X_6 ----------------- # 变量6与变量14的散点图 ggplot(boston, aes(x = rm, y = medv, color = c)) + geom_point() # 变量6依房价的分类箱线图 ggplot(data = boston, aes(x = c, y = rm)) + geom_boxplot(notch = FALSE) # ----------------- 变量 X_7 ----------------- # 变量7与变量14的散点图 ggplot(boston, aes(x = age, y = medv, color = c)) + geom_point() # 变量7依房价的分类箱线图 ggplot(data = boston, aes(x = c, y = age)) + geom_boxplot(notch = TRUE, fill = "cyan", colour = "orangered2") # 变量7变换后的核密度图 Y7 = Boston$age^(2.5) / 10000 boston_sub = cbind(boston_sub, age_new = Y7) ggplot(boston_sub, aes(age_new)) + geom_density(linewidth = 1, colour = 'steelblue') # 变量7与8的散点图 ggplot(boston, aes(x = age, y = dis, color = c)) + geom_point() # ----------------- 变量 X_8 ----------------- # 变量8与变量14的散点图 ggplot(boston, aes(x = dis, y = medv, color = c)) + geom_point() # 变量8依房价的分类箱线图 ggplot(data = boston, aes(x = c, y = dis)) + geom_boxplot(notch = TRUE, fill = "cyan", colour = "orangered2") # ----------------- 变量 X_9 ----------------- # 变量9与14的散点图 p9_1 = ggplot(boston, aes(x = rad, y = medv)) + geom_point() # 变量9直方图 p9_2 = ggplot(boston, aes(x = rad)) + geom_histogram(binwidth = 1) # 变量9的核密度图 p9_3 = ggplot(boston, aes(x = rad)) + geom_density() grid.arrange(p9_1, p9_2, p9_3, ncol = 3) # 变量9与变量14的散点图 p9_4 = ggplot(boston, aes(x = rad, y = medv, color = c)) + geom_point() # 变量9依变量14的箱线图 p9_5 = ggplot(boston, aes(x = c, y = rad)) + geom_boxplot(notch = FALSE, fill = "cyan", colour = "orangered2") # 变量9依变量14的分类核密度图 p9_6 = ggplot(boston, aes(x = rad, fill = c, colour = c)) + geom_density(alpha = 0.2) grid.arrange(p9_4, p9_5, p9_6, ncol = 3) # ----------------- 变量 X_10 ----------------- # 变量10与14的散点图 p10_1 = ggplot(boston, aes(x = tax, y = medv)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.85, linewidth = 1.5, colour = 'red3') # 变量10的核密度图 p10_2 = ggplot(boston, aes(x = tax)) + geom_density() grid.arrange(p10_1, p10_2, ncol = 2) # 变量10依变量14的箱线图 p10_3 = ggplot(boston, aes(x = c, y = tax)) + geom_boxplot(notch = FALSE, fill = "cyan", colour = "orangered2") grid.arrange(p10_1, p10_3, ncol = 2) # ----------------- 变量 X_11 ----------------- # 变量11依变量14的箱线图 p11_1 = ggplot(boston, aes(x = c, y = ptratio)) + geom_boxplot(notch = FALSE, fill = "cyan", colour = "orangered2") p11_1 # 变量11依14的分类散点图 p11_2 = ggplot(boston, aes(x = ptratio, y = medv, colour = c)) + geom_point() + geom_smooth(method = 'lm', linewidth = 1.5, colour = 'blue', se = FALSE) p11_2 # ----------------- 变量 X_12 ----------------- # 变量12与3,7,11,14的散点图矩阵 p12_3 = ggplot(boston, aes(x = black, y = indus)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3') p12_7 = ggplot(boston, aes(x = black, y = age)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3') p12_11 = ggplot(boston, aes(x = black, y = ptratio)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3') p12_14 = ggplot(boston, aes(x = black, y = medv)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3') grid.arrange(p12_3, p12_7, p12_11, p12_14, ncol = 2) # 变量12的箱线图、描述性统计量值 ggplot(boston, aes(y = black)) + geom_boxplot(notch = FALSE, fill = "white", colour = "steelblue") summary(boston$black) # 变量12取值位于上、下四分位数之外的个数 dim(boston) sum(boston$black < 375) sum(boston$black > 397) # 观测数据405至470的变量12的取值 boston$black[405:470] # 变量12对数值的散点图 black_log = log(boston$black) boston = cbind(boston, black_log = black_log) p12_log = ggplot(boston, aes(x = black_log, y = medv)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3') p12_medv = ggplot(boston, aes(x = black, y = medv)) + geom_point() grid.arrange(p12_log, p12_medv, ncol = 2) # 变量12与3的散点图 ggplot(boston, aes(x = black, y = indus)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3') # 变量12与7的散点图 ggplot(boston, aes(x = black, y = age)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3') # 变量12与11的散点图 ggplot(boston, aes(x = black, y = ptratio)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3') # 变量12与14的散点图 ggplot(boston, aes(x = black, y = medv, colour = c)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.95, linewidth = 1.5, colour = 'red3') # ----------------- 变量 X_13 ----------------- # 变量13与14的散点图 p13_1 = ggplot(boston, aes(x = lstat, y = medv, colour = c)) + geom_point() + geom_smooth(method = 'loess', se = FALSE, span = 0.8, linewidth = 1.5, colour = 'blue') # 变量13的箱线图 p13_2 = ggplot(boston, aes(y = lstat)) + geom_boxplot(notch = TRUE, fill = "white", colour = "blue") grid.arrange(p13_1, p13_2, ncol = 2) # 变量13开平方根与14取对数的散点图 lstat_root = sqrt(boston$lstat) medv_log = log(boston$medv) boston = cbind(boston, lstat_root = lstat_root, medv_log = medv_log) ggplot(boston, aes(x = lstat_root, y = medv_log, colour = c)) + geom_point() + geom_smooth(method = 'lm', se = FALSE, linewidth = 1.5, colour = 'blue') # ----------------- 变量变换 ----------------- trans_X1 = log(boston$crim) trans_X2 = boston$zn / 10 trans_X3 = log(boston$indus) trans_X5 = log(boston$nox) trans_X6 = log(boston$rm) trans_X7 = boston$age^(2.5) / 10000 trans_X8 = log(boston$dis) trans_X9 = log(boston$rad) trans_X10 = log(boston$tax) trans_X11 = (exp(0.4 * boston$ptratio)) / 1000 trans_X12 = boston$black / 100 trans_X13 = sqrt(boston$lstat) trans_X14 = log(boston$medv) trans_boston = data.frame(trans_X1, trans_X2, trans_X3, boston$chas, trans_X5, trans_X6, trans_X7, trans_X8, trans_X9, trans_X10, trans_X11, trans_X12, trans_X13, trans_X14) names(trans_boston) = names(Boston) par(mfrow = c(2, 1)) boxplot(Boston, main = 'Boston Housing Data', axes = FALSE, boxwex = 0.5) axis(2) boxplot(trans_boston, main = 'Transformed Boston Housing Data', axes = FALSE, boxwex = 0.5) axis(2)