前 言

核密度估计图(kernel density plot)用于显示数据在 X 轴连续数据段内的分布状况。这种图表是直方图的变种,使用平滑曲线来绘制水平数值,从而得出更平滑的分布。核密度估计图比直方图优胜的地方,在于它们不受所使用分组数量的影响,所以能更好地界定分布形状。核密度估计(kernel density estimation)在概率论中用来估计未知的密度函数,属于非参数检验方法之一,由 Rosenblatt(1955)和 Emanuel Parzen(1962)提出,又名 Parzen 窗(Parzen window)。所谓核密度估计,就是采用平滑的峰值函数(核)来拟合观察到的数据点,从而对真实的概率分布曲线进行模拟。核密度估计,是一种用于估计概率密度函数的非参数方法。


一般密度图我们使用ggplot2 中的 geom_density()函数就可以实现,另外我们可以利用核密度估计的思想绘制山峦图,其实也就是数据的组合,使用 ggridges 软件包,下载安装如下:





# Attach the plotting packages used by the examples in this script.
# If they are not installed yet, run once:
#   install.packages(c("ggplot2", "ggridges"))
library(ggplot2)
library(ggridges)

# Kernel density estimate of diamond carat weight.
ggplot(diamonds, aes(carat)) +
  geom_density() +
  theme_bw()


# One density outline per cut, with the x axis limited to 55-70.
ggplot(data = diamonds, mapping = aes(x = depth, colour = cut)) +
  geom_density() +
  theme_bw() +
  xlim(55, 70)


# Same per-cut densities, now also filled; alpha = 0.1 keeps overlapping
# areas readable.
ggplot(
  data = diamonds,
  mapping = aes(x = depth, fill = cut, colour = cut)
) +
  geom_density(alpha = 0.1) +
  theme_bw() +
  xlim(55, 70)




# Ridgeline plot of sepal length per iris species.
# `rel_min_height` trims the long, nearly flat tails of each ridge.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges(rel_min_height = 0.005) +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(expand = c(0.01, 0)) +
  theme_ridges()


# `scale` controls how strongly neighbouring ridges overlap.
ggplot(data = diamonds, mapping = aes(x = price, y = cut)) +
  geom_density_ridges(scale = 4) +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(expand = c(0.01, 0)) +
  theme_ridges()


# Coloured variant of the price-by-cut ridgeline plot. The ridge height is
# taken from ggplot2's own "density" stat, and the legend is hidden.
ggplot(
  data = diamonds,
  mapping = aes(x = price, y = cut, fill = cut, height = ..density..)
) +
  geom_density_ridges(scale = 4, stat = "density") +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(expand = c(0.01, 0)) +
  scale_fill_brewer(palette = 4) +
  theme_ridges() +
  theme(legend.position = "none")


# geom_density_ridges2() is the variant of geom_density_ridges() to use
# when solid polygons are wanted.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges2() +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(expand = c(0.01, 0)) +
  theme_ridges()


# Ridgeline plot of price by cut with all axis padding removed.
# Each layer sits on its own line so that the trailing comments no longer
# swallow the layers that follow them.
ggplot(diamonds, aes(x = price, y = cut)) +
  geom_density_ridges(scale = 4) +
  scale_y_discrete(expand = c(0, 0)) +     # will generally have to set the `expand` option
  scale_x_continuous(expand = c(0, 0)) +   # for both axes to remove unneeded padding
  coord_cartesian(clip = "off") +          # to avoid clipping of the very top of the top ridgeline
  theme_ridges()
# Console message recorded with this example:
# > Picking joint bandwidth of 458
# Monthly temperature ridges for lincoln_weather, filled by the estimated
# density. The fill palette is the reversed 11-colour "Spectral" palette,
# interpolated to 32 colours.
# brewer.pal() is namespace-qualified because RColorBrewer is not attached
# by this script.
ggplot(
  lincoln_weather,
  aes(x = `Mean Temperature [F]`, y = Month, fill = ..density..)
) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0, size = 0.3) +
  scale_fill_gradientn(
    colours = colorRampPalette(
      rev(RColorBrewer::brewer.pal(11, "Spectral"))
    )(32)
  )


# Monthly temperature ridges where the fill follows the x value
# (temperature) itself, drawn with viridis option "C".
ggplot(
  data = lincoln_weather,
  mapping = aes(x = `Mean Temperature [F]`, y = Month, fill = stat(x))
) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "Temp. [F]", option = "C") +
  labs(title = "Temperatures in Lincoln NE in 2016")


# Quantile lines, and colouring by quantiles or probabilities.
# First example: draw quantile lines with the stat's default quantiles.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Species)) +
  stat_density_ridges(quantile_lines = TRUE)


# Quantile lines at the 2.5% and 97.5% quantiles, on semi-transparent ridges.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Species)) +
  stat_density_ridges(
    quantile_lines = TRUE,
    quantiles = c(0.025, 0.975),
    alpha = 0.7
  )

使用 geom_density_ridges_gradient 这一几何对象时,我们还可以借助计算得到的美学属性 stat(quantile) 按分位数着色。注意,只有当 calc_ecdf = TRUE 时才会计算该美学属性。

# Colour each ridge by the computed `quantile` variable (quantiles = 4),
# with quantile lines drawn on top.
ggplot(
  data = iris,
  mapping = aes(
    x = Sepal.Length,
    y = Species,
    fill = factor(stat(quantile))
  )
) +
  stat_density_ridges(
    geom = "density_ridges_gradient",
    calc_ecdf = TRUE,
    quantiles = 4,
    quantile_lines = TRUE
  ) +
  scale_fill_viridis_d(name = "Quartiles")


# Split each ridge at the 2.5% and 97.5% quantiles and give the three
# resulting bands manually chosen colours and labels.
ggplot(
  data = iris,
  mapping = aes(
    x = Sepal.Length,
    y = Species,
    fill = factor(stat(quantile))
  )
) +
  stat_density_ridges(
    geom = "density_ridges_gradient",
    calc_ecdf = TRUE,
    quantiles = c(0.025, 0.975)
  ) +
  scale_fill_manual(
    name = "Probability",
    values = c("#FF0000A0", "#A0A0A0A0", "#0000FFA0"),
    labels = c("(0, 0.025]", "(0.025, 0.975]", "(0.975, 1]")
  )

最后,当 calc_ecdf = TRUE 时,我们还可以获得一个计算得到的美学属性 stat(ecdf),它表示分布的经验累积分布函数。这使得我们可以将概率直接映射到颜色上。

# Map the computed `ecdf` variable to fill via 0.5 - |0.5 - ecdf|, so the
# fill value is largest at the median and falls to 0 in both tails.
ggplot(
  data = iris,
  mapping = aes(
    x = Sepal.Length,
    y = Species,
    fill = 0.5 - abs(0.5 - stat(ecdf))
  )
) +
  stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE) +
  scale_fill_viridis_c(name = "Tail probability", direction = -1)

增加抖动点:stat_density_ridges 还提供了将生成分布的原始数据点可视化的选项。这可以通过在 stat_density_ridges 或 geom_density_ridges 中设置 jittered_points = TRUE 来实现:

# Overlay the raw observations as jittered points on each ridge.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges(jittered_points = TRUE)


# Jittered points placed with the "raincloud" position.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges(
    jittered_points = TRUE,
    position = "raincloud",
    alpha = 0.7,
    scale = 0.9
  )


# Rug-like display: points are drawn as "|" marks, jittered horizontally by
# 0.05 and not at all vertically.
# The dangling comma after `alpha = 0.7` was removed; it passed an empty
# argument into `...`.
ggplot(iris, aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges(
    jittered_points = TRUE,
    position = position_points_jitter(width = 0.05, height = 0),
    point_shape = "|",
    point_size = 3,
    point_alpha = 1,
    alpha = 0.7
  )


# Style the jittered points per species: point colour, fill and shape are
# all mapped to Species; shapes 21-23 are assigned manually.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Species, fill = Species)
) +
  geom_density_ridges(
    mapping = aes(
      point_color = Species,
      point_fill = Species,
      point_shape = Species
    ),
    alpha = 0.2,
    point_alpha = 1,
    jittered_points = TRUE
  ) +
  scale_point_color_hue(l = 40) +
  scale_discrete_manual(
    aesthetics = "point_shape",
    values = c(21, 22, 23)
  )


# As in the per-species example, but point size is additionally mapped to
# Petal.Length and rescaled to the range 0.5-4.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Species, fill = Species)
) +
  geom_density_ridges(
    mapping = aes(
      point_shape = Species,
      point_fill = Species,
      point_size = Petal.Length
    ),
    alpha = 0.2,
    point_alpha = 1,
    jittered_points = TRUE
  ) +
  scale_point_color_hue(l = 40) +
  scale_point_size_continuous(range = c(0.5, 4)) +
  scale_discrete_manual(
    aesthetics = "point_shape",
    values = c(21, 22, 23)
  )

类似地,垂直线的样式可以通过 vline_size、vline_color 等参数来设置。垂直线也可以平移,使其与抖动点对齐。这样我们就可以生成如下图形:

# Raincloud layout with red quantile lines; `adjust_vlines = TRUE` is passed
# to position_raincloud() so the lines are shifted along with the points.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges(
    jittered_points = TRUE,
    quantile_lines = TRUE,
    scale = 0.9,
    alpha = 0.7,
    vline_size = 1,
    vline_color = "red",
    point_size = 0.4,
    point_alpha = 1,
    position = position_raincloud(adjust_vlines = TRUE)
  )


# Height distributions of Australian athletes by sport, split by sex.
# Raw observations are drawn as "|" marks. Only the fill legend is shown;
# its keys are overridden with more opaque colours and no outline or points.
ggplot(
  data = Aus_athletes,
  mapping = aes(
    x = height,
    y = sport,
    color = sex,
    point_color = sex,
    fill = sex
  )
) +
  geom_density_ridges(
    jittered_points = TRUE,
    scale = 0.95,
    rel_min_height = 0.01,
    point_shape = "|",
    point_size = 3,
    size = 0.25,
    position = position_points_jitter(height = 0)
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0), name = "height [cm]") +
  scale_fill_manual(
    values = c("#D55E0050", "#0072B250"),
    labels = c("female", "male")
  ) +
  scale_color_manual(values = c("#D55E00", "#0072B2"), guide = "none") +
  scale_discrete_manual(
    "point_color",
    values = c("#D55E00", "#0072B2"),
    guide = "none"
  ) +
  coord_cartesian(clip = "off") +
  guides(
    fill = guide_legend(
      override.aes = list(
        fill = c("#D55E00A0", "#0072B2A0"),
        color = NA,
        point_color = NA
      )
    )
  ) +
  ggtitle("Height in Australian athletes") +
  theme_ridges(center = TRUE)





Wilke C (2022). ggridges: Ridgeline Plots in ‘ggplot2’. R package version 0.5.4, https://wilkelab.org/ggridges/.

