R语言层次关系数据可视化
知识点总览
- 旭日图 (Sunburst Chart):多层圆环展示层次结构比例
- 树状图 (Dendrogram/Tree Diagram):展示聚类关系或层级结构
- 桑基图 (Sankey Diagram):展示数据流动与转换关系
- 矩形树状图 (Treemap):嵌套矩形展示层次数据占比
- 圆堆积图 (Circle Packing):嵌套圆展示层次结构与权重
1. 旭日图 (Sunburst Chart)
语法知识点
旭日图通过多层圆环表示层次关系,内层到外层逐级细分。常用包:
- sunburstR: 交互式旭日图
- ggplot2 + geom_rect(): 配合极坐标转换手工绘制
- plotly 的 plot_ly(type = 'sunburst'): 交互式旭日图
核心概念:
- 根节点在最内圈,子节点向外扩展
- 每个扇区角度与数值大小成正比
- 颜色通常用于区分不同类别
案例代码
r
# 安装并加载必要的包
if(!require(plotly)) install.packages("plotly")
if(!require(dplyr)) install.packages("dplyr")
library(plotly)
library(dplyr)
# Build the hierarchy for the sunburst: one row per node of a company's sales
# structure. Rows are listed parent-before-child; the roll-up below relies on
# that ordering.
sunburst_data <- data.frame(
  # Unique node identifier
  id = c("total", "product", "service", "hardware", "software",
         "consulting", "support", "laptop", "desktop", "app", "cloud"),
  # Identifier of the parent node; plotly expects "" (not NA) for the root
  parent = c("", "total", "total", "product", "product",
             "service", "service", "hardware", "hardware", "software",
             "software"),
  # Own value of each node. Only leaves carry a value here; internal nodes are
  # rolled up below. "consulting" and "support" are leaves too, so they need
  # non-zero values or the whole "service" branch would have zero width
  # (illustrative figures).
  value = c(0, 0, 0, 0, 0, 260, 180, 450, 320, 280, 190),
  # Text shown on each sector
  label = c("总销售额", "产品", "服务", "硬件", "软件",
            "咨询", "技术支持", "笔记本电脑", "台式电脑", "移动应用", "云服务"),
  # Hierarchy level, used only to pick a colour
  group = c("root", "category", "category", "sub", "sub",
            "leaf", "leaf", "leaf", "leaf", "leaf", "leaf"),
  stringsAsFactors = FALSE
)

# Roll leaf values up to their ancestors so that every parent equals the sum
# of its children, which is what branchvalues = "total" requires. Walking the
# rows in reverse visits children before parents.
for (i in rev(seq_len(nrow(sunburst_data)))) {
  parent_row <- match(sunburst_data$parent[i], sunburst_data$id)
  if (!is.na(parent_row)) {
    sunburst_data$value[parent_row] <-
      sunburst_data$value[parent_row] + sunburst_data$value[i]
  }
}

# marker$colors needs one colour per node, so expand the per-level palette.
group_colors <- c(root = "#2E86AB",
                  category = "#A23B72",
                  sub = "#F18F01",
                  leaf = "#C73E1D")
node_colors <- unname(group_colors[sunburst_data$group])

# Draw the sunburst. `ids` must be supplied because `parents` refers to node
# ids, not to the (Chinese) display labels.
fig <- plot_ly(
  data = sunburst_data,
  ids = ~id,
  labels = ~label,
  parents = ~parent,
  values = ~value,
  type = "sunburst",
  branchvalues = "total",  # each parent value already includes its children
  marker = list(colors = node_colors),
  # percentRoot is a fraction, so format it as a percentage with d3-format
  hovertemplate = paste0(
    "<b>%{label}</b><br>",
    "数值: %{value}<br>",
    "占比: %{percentRoot:.1%}<br>",
    "<extra></extra>"  # hide the secondary trace box
  )
) %>%
  layout(
    title = list(text = "旭日图:公司销售结构", font = list(size = 16)),
    # Leave room for the title only
    margin = list(t = 50, l = 0, r = 0, b = 0)
  )

# Show the chart
fig
# 另一种方法:使用ggplot2创建简单的环形层次图
if(!require(ggplot2)) install.packages("ggplot2")
library(ggplot2)
# Example data: sales by region (level1) and country (level2). Each row
# becomes one arc of the ring.
region_data <- data.frame(
  level1 = c("亚洲", "亚洲", "欧洲", "欧洲", "美洲", "美洲"),
  level2 = c("中国", "日本", "德国", "法国", "美国", "加拿大"),
  value = c(380, 210, 290, 180, 420, 150)
)

# Cumulative bounds of every arc along the angular axis. The lower bound is
# built with base R on purpose: a bare lag() silently resolves to stats::lag()
# when dplyr is not attached, which returns the input unshifted and collapses
# every arc to zero width.
region_data$ymax <- cumsum(region_data$value)
region_data$ymin <- c(0, head(region_data$ymax, -1))
region_data$mid <- (region_data$ymax + region_data$ymin) / 2

# Rectangles between x = 3 and x = 4 become a ring once the y axis is mapped
# to the angle; xlim() starting at 2 leaves the hole in the middle.
p <- ggplot(region_data) +
  geom_rect(aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = level1),
            colour = "white") +  # white border separates same-region arcs
  geom_text(aes(x = 3.5, y = mid, label = level2), size = 3) +
  coord_polar(theta = "y") +
  xlim(c(2, 4)) +
  theme_void() +
  labs(title = "旭日图(简化版):地区销售分布") +
  theme(legend.position = "right")
print(p)
2. 树状图 (Dendrogram / Tree Diagram)
语法知识点
树状图用于展示层次聚类结果或决策树结构。常用包:
- stats::hclust() + plot(): 聚类树状图
- openintro::treeDiag(): 决策树图(概率树)
- collapsibleTree: 交互式可折叠树图
核心概念:
- 聚类树状图:通过距离矩阵展示样本间的层次聚类关系
- 决策树:展示分类/回归的决策路径
案例代码
2.1 聚类树状图
r
# Hierarchical clustering of the built-in USArrests data set.
# Peek at the first rows.
head(USArrests)

# The variables are on different scales, so standardise them (mean 0, sd 1)
# before computing Euclidean distances.
scaled_data <- scale(USArrests)
dist_matrix <- dist(scaled_data, method = "euclidean")

# Agglomerative clustering. Other linkage choices include "complete" and
# "single"; "ward.D2" minimises within-cluster variance.
hc <- hclust(dist_matrix, method = "ward.D2")

# Cluster membership of every state (ids 1..3 as assigned by cutree()).
cluster_assignment <- cutree(hc, k = 3)

# One colour per cutree() cluster id.
cluster_colors <- c("red", "blue", "green")

# rect.hclust() draws its rectangles from left to right along the dendrogram,
# which is generally NOT the cutree() id order. Work out which cluster id sits
# at each position so rectangle colours agree with the legend and with
# cluster_assignment.
cluster_order <- unique(cluster_assignment[hc$order])

# Base dendrogram
plot(hc,
     main = "树状图:美国各州犯罪率层次聚类",
     xlab = "州",
     ylab = "距离",
     sub = "",      # drop the default subtitle
     hang = -1,     # align all leaf labels at the bottom
     cex = 0.6)     # label size

# Outline the three clusters, coloured by cluster id
cluster_rects <- rect.hclust(hc, k = 3,
                             border = cluster_colors[cluster_order])

# Legend: colour i corresponds to cutree() cluster i
legend("topright",
       legend = c("簇1", "簇2", "簇3"),
       fill = cluster_colors,
       title = "聚类结果")

# Inspect the cluster membership of the first few states
head(cluster_assignment)
# 使用ggplot2绘制更美观的树状图
if(!require(ggplot2)) install.packages("ggplot2")
if(!require(ggdendro)) install.packages("ggdendro")
library(ggplot2)
library(ggdendro)
# Convert the hclust object into plain data frames (segments + leaf labels).
# The result is stored under a name that does not shadow the dendro_data()
# function.
dendro_df <- dendro_data(hc, type = "rectangle")

# Draw the dendrogram with ggplot2
ggplot() +
  geom_segment(data = dendro_df$segments,
               aes(x = x, y = y, xend = xend, yend = yend)) +
  # Labels are rotated 90 degrees; hjust = 1 anchors their END at the leaf so
  # the text hangs below the tree instead of running up into the branches.
  geom_text(data = dendro_df$labels,
            aes(x = x, y = y, label = label),
            hjust = 1, angle = 90, size = 3, nudge_y = -0.1) +
  # Extra space below y = 0 so the hanging labels are not clipped
  scale_y_continuous(expand = expansion(mult = c(0.25, 0.05))) +
  theme_minimal() +
  labs(title = "树状图(ggplot2版本):美国各州聚类分析",
       x = "州", y = "距离") +
  theme(axis.text.x = element_blank())  # state names are drawn as text above
2.2 决策树图
r
# 使用openintro包的treeDiag绘制决策树
if(!require(openintro)) install.packages("openintro")
library(openintro)
# Example: will a customer buy the product?
# The tree has two levels:
#   level 1 - was the advertisement effective?
#   level 2 - is the price acceptable?

# Level-1 branch probabilities: effective (0.6) vs. not effective (0.4)
prob_level1 <- c(0.6, 0.4)

# Level-2 conditional probabilities, one vector per level-1 branch:
#   ad effective     -> price acceptable 0.7, not acceptable 0.3
#   ad not effective -> price acceptable 0.3, not acceptable 0.7
prob_level2 <- list(
  c(0.7, 0.3),
  c(0.3, 0.7)
)

# Draw the tree diagram
treeDiag(
  # Question asked at each level
  main = c("广告是否有效?", "价格是否合适?"),
  # Branch labels for level 1 and level 2
  out1 = c("有效", "无效"),
  out2 = c("合适", "不合适"),
  # Probabilities for level 1 and level 2
  p1 = prob_level1,
  p2 = prob_level2,
  # Show the joint probabilities, with three decimals
  showSol = TRUE,
  digits = 3,
  # Heading appearance
  cex.main = 1.2,
  col.main = "#2E86AB"
)
3. 桑基图 (Sankey Diagram)
语法知识点
桑基图用于展示数据在节点间的流动,边的宽度与流量成正比。常用包:
- networkD3::sankeyNetwork(): 交互式桑基图
- ggalluvial: ggplot2 扩展的冲积图
- plotly::plot_ly(type = 'sankey'): 交互式桑基图
数据结构要求:
- nodes: 节点列表(每个节点唯一标识)
- links: 连接关系(source、target、value三列)
- source/target 使用节点索引(从0开始)
案例代码
r
# 安装并加载networkD3包
if(!require(networkD3)) install.packages("networkD3")
library(networkD3)
# Example: conversion funnel of website visitors.
# Nodes are all possible stages. Their row order defines the zero-based index
# used by the links below.
nodes <- data.frame(
  name = c("访问首页",     # index 0
           "浏览产品",     # index 1
           "加入购物车",   # index 2
           "开始结算",     # index 3
           "支付成功",     # index 4
           "支付失败",     # index 5
           "流失用户")     # index 6
)

# Links between stages:
#   source - zero-based index of the start node
#   target - zero-based index of the end node
#   value  - number of users taking that path
# Flows are conserved: what leaves each intermediate node equals what
# entered it. No node links to itself, because a Sankey layout needs an
# acyclic graph.
links <- data.frame(
  source = c(0, 0, 1, 1, 2, 2, 3, 3, 5),
  target = c(1, 6, 2, 6, 3, 6, 4, 5, 6),
  value = c(1000, 300, 650, 350, 550, 100, 450, 100, 100)
)

# Guard against accidentally introducing a self-loop or an index that does
# not exist in `nodes` when editing the data.
stopifnot(
  all(links$source != links$target),
  all(c(links$source, links$target) %in% (seq_len(nrow(nodes)) - 1))
)

# Build the interactive Sankey diagram
sankey_graph <- sankeyNetwork(
  Links = links,        # link data frame
  Nodes = nodes,        # node data frame
  Source = "source",    # column holding the source index
  Target = "target",    # column holding the target index
  Value = "value",      # column holding the flow size
  NodeID = "name",      # column holding the node label
  units = "人",         # unit appended to values in tooltips
  fontSize = 12,        # label font size
  nodeWidth = 20,       # node width in pixels
  nodePadding = 20,     # vertical gap between nodes
  # Colour scale for the nodes
  colourScale = JS('d3.scaleOrdinal(d3.schemeCategory10)'),
  # FALSE: do not force terminal nodes to the right edge
  sinksRight = FALSE
)

# Show the diagram
sankey_graph
# 使用plotly创建桑基图(另一种交互方式)
if(!require(plotly)) install.packages("plotly")
library(plotly)
# Node appearance: labels come from `nodes`, one colour per node
sankey_nodes <- list(
  label = nodes$name,
  color = c("#2E86AB", "#A23B72", "#F18F01",
            "#C73E1D", "#6A994E", "#BC4A3C", "#8B8C89"),
  pad = 15,         # spacing between nodes
  thickness = 20,   # node thickness
  line = list(color = "black", width = 0.5)
)

# Link definition: indices and flow sizes come from `links`; the colour is a
# semi-transparent grey
sankey_links <- list(
  source = links$source,
  target = links$target,
  value = links$value,
  color = "rgba(100, 100, 100, 0.3)"
)

# Assemble the horizontal Sankey trace and apply the layout in one call
plotly_sankey <- layout(
  plot_ly(
    type = "sankey",
    orientation = "h",
    node = sankey_nodes,
    link = sankey_links
  ),
  title = "桑基图:网站用户转化路径",
  font = list(size = 12),
  xaxis = list(showgrid = FALSE, zeroline = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE)
)
plotly_sankey
# 使用ggalluvial包绘制冲积图(桑基图的ggplot2版本)
if(!require(ggalluvial)) install.packages("ggalluvial")
if(!require(dplyr)) install.packages("dplyr")
library(ggalluvial)
library(dplyr)
# Simulated course-selection records: one row per student and semester.
# The seed makes the random example (and therefore the figure) reproducible.
set.seed(2024)
n_students <- 200
semesters <- c("第一学期", "第二学期", "第三学期")
n_records <- n_students * length(semesters)

alluvial_data <- data.frame(
  student_id = rep(seq_len(n_students), each = length(semesters)),
  semester = rep(semesters, times = n_students),
  course = sample(c("数学", "物理", "化学", "生物"), n_records,
                  replace = TRUE),
  grade = sample(c("A", "B", "C", "D"), n_records, replace = TRUE,
                 prob = c(0.3, 0.4, 0.2, 0.1))
)

# Number of records for every semester / course / grade combination
alluvial_summary <- alluvial_data %>%
  count(semester, course, grade, name = "count")

# Alluvial plot. The three axes are semester, course and grade, so the x-axis
# tick labels name those dimensions (not three semesters).
ggplot(alluvial_summary,
       aes(axis1 = semester, axis2 = course, axis3 = grade, y = count)) +
  geom_alluvium(aes(fill = course), width = 1/12) +  # flows between strata
  geom_stratum(width = 1/12, fill = "grey90", color = "black") +  # strata
  geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("学期", "课程", "成绩"),
                   expand = c(0.05, 0.05)) +
  labs(title = "冲积图(桑基图变体):学生课程选择变化",
       x = "分类维度", y = "学生人数") +
  theme_minimal() +
  theme(legend.position = "bottom")
4. 矩形树状图 (Treemap)
语法知识点
矩形树状图通过嵌套矩形展示层次数据的占比关系。常用包:
- treemap::treemap(): 功能全面的矩形树状图
- ggplot2 + treemapify: ggplot2 扩展
核心概念:
- 矩形面积与数值大小成正比
- 嵌套矩形表示层次结构
- 颜色可编码第二维变量
案例代码
r
# 安装并加载treemap包
if(!require(treemap)) install.packages("treemap")
library(treemap)
# GNI2014 ships with the treemap package: per-country population and gross
# national income per capita.
data(GNI2014)
head(GNI2014)

# Basic treemap
treemap(GNI2014,
        # Hierarchy: continent first, then country (ISO3 code)
        index = c("continent", "iso3"),
        # Rectangle area: population
        vSize = "population",
        # Rectangle colour: GNI per capita
        vColor = "GNI",
        # "value" maps a numeric variable onto a continuous palette
        type = "value",
        title = "矩形树状图:全球人口与收入分布",
        # RdYlGn already runs red (low) -> green (high), so high income is
        # green without reversing anything. `range` and `mapping` keep their
        # defaults so the colour scale spans the actual GNI values.
        palette = "RdYlGn",
        # Label formatting: one entry per hierarchy level
        fontsize.labels = c(12, 8),
        fontcolor.labels = c("white", "black"),
        fontface.labels = c(2, 1),   # bold continents, plain countries
        bg.labels = 0,               # fully transparent label background
        # Borders: one entry per hierarchy level
        border.col = c("black", "white"),
        border.lwds = c(2, 1),
        # Layout algorithm: "pivotSize" (default) or "squarified"
        algorithm = "pivotSize"
)
# 使用treemapify包(ggplot2风格)
if(!require(treemapify)) install.packages("treemapify")
if(!require(ggplot2)) install.packages("ggplot2")
library(treemapify)
library(ggplot2)
# Sales figures per product, grouped into three business divisions
sales_data <- data.frame(
  division = rep(c("消费电子", "企业服务", "云服务"), times = c(3, 2, 3)),
  product = c("智能手机", "平板电脑", "可穿戴设备",
              "数据分析", "IT咨询",
              "云存储", "云计算", "AI服务"),
  sales = c(1250, 680, 320, 890, 560, 2100, 1850, 950),
  growth = c(0.12, -0.05, 0.25, 0.08, 0.03, 0.35, 0.28, 0.42)
)

# Treemap: tile area encodes sales, tile colour encodes growth rate, and tiles
# are grouped by division
ggplot(
  data = sales_data,
  mapping = aes(
    area = sales,
    fill = growth,
    label = product,
    subgroup = division
  )
) +
  # Product tiles
  geom_treemap() +
  # Outline around each division
  geom_treemap_subgroup_border(colour = "black", size = 1.5) +
  # Division name: centred, bold, grown to fill the group, semi-transparent
  geom_treemap_subgroup_text(
    place = "centre",
    grow = TRUE,
    alpha = 0.7,
    colour = "black",
    fontface = "bold"
  ) +
  # Product name: top-left corner, fixed size, wrapped when too long
  geom_treemap_text(
    place = "topleft",
    grow = FALSE,
    reflow = TRUE,
    colour = "white"
  ) +
  # Diverging colour scale centred on a growth rate of 0.15
  scale_fill_gradient2(
    low = "red", mid = "yellow", high = "green",
    midpoint = 0.15, name = "增长率"
  ) +
  labs(
    title = "矩形树状图:公司产品销售分布",
    subtitle = "矩形面积 = 销售额(万元),颜色 = 增长率"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5),
    axis.text = element_blank(),
    axis.title = element_blank()
  )
5. 圆堆积图 (Circle Packing)
语法知识点
圆堆积图使用嵌套圆展示层次关系,圆的面积与数值成正比。常用包:
- packcircles: 圆布局算法
- ggplot2 + ggforce::geom_circle(): 绘图
- circlize: 圆形可视化
核心概念:
- 圆面积与数据值成比例
- 嵌套表示层次关系
- 布局算法避免重叠
案例代码
r
# 安装并加载必要的包
if(!require(packcircles)) install.packages("packcircles")
if(!require(ggplot2)) install.packages("ggplot2")
if(!require(ggforce)) install.packages("ggforce")
if(!require(dplyr)) install.packages("dplyr")
library(packcircles)
library(ggplot2)
library(ggforce)
library(dplyr)
# Example 1: single-level circle packing comparing category sizes.
# Data: budget of each department (unit: 10,000 yuan).
dept_data <- data.frame(
  department = c("研发部", "市场部", "销售部", "人事部", "财务部",
                 "运营部", "法务部", "IT部"),
  budget = c(450, 320, 380, 120, 110, 280, 90, 210)
)

# Progressive layout: one circle per department, sized so that its AREA is the
# budget. The result holds a centre (x, y) and a radius for each circle.
packing <- circleProgressiveLayout(x = dept_data$budget, sizetype = "area")

# Outline of every circle as a 100-point polygon, keyed by `id`
dept_vertices <- circleLayoutVertices(packing, npoints = 100)

# Attach the circle centres and radii to the department table (used for the
# text labels)
dept_data <- cbind(dept_data, packing)

ggplot() +
  # Circles, drawn as filled polygons
  geom_polygon(
    data = dept_vertices,
    mapping = aes(x = x, y = y, group = id, fill = as.factor(id)),
    colour = "white", size = 1, alpha = 0.8
  ) +
  # Department name and budget at the centre of each circle
  geom_text(
    data = dept_data,
    mapping = aes(x = x, y = y,
                  label = paste(department, "\n", budget, "万")),
    size = 3.5, fontface = "bold"
  ) +
  # Qualitative palette; the legend is hidden because circles are labelled
  scale_fill_brewer(palette = "Set3", guide = "none") +
  # Equal axis scaling keeps the circles round
  coord_equal() +
  theme_void() +
  labs(title = "圆堆积图:公司各部门预算分布",
       subtitle = "圆的面积与预算金额成正比")
# Example 2: nested circle packing showing a company > department > team
# hierarchy.
hierarchy_data <- data.frame(
  # Level 1: company
  company = rep("科技集团", 8),
  # Level 2: department
  department = c("研发部", "研发部", "研发部", "市场部", "市场部",
                 "销售部", "销售部", "销售部"),
  # Level 3: team
  team = c("前端组", "后端组", "AI组", "品牌组", "数字营销组",
           "大客户组", "渠道组", "电销组"),
  # Value attached to each team
  headcount = c(25, 30, 18, 12, 20, 35, 22, 28)
)

# Total headcount per department
dept_agg <- hierarchy_data %>%
  group_by(department) %>%
  summarise(total = sum(headcount), .groups = "drop")

# Outer layout: one circle per department, area = total headcount
dept_layout <- circleProgressiveLayout(dept_agg$total, sizetype = "area")
dept_agg <- cbind(dept_agg, dept_layout)

# Fraction of the department radius the packed teams may occupy; the rest is
# a visual margin between the team circles and the department outline.
inner_fill <- 0.95

# Inner layouts: pack the teams of every department, then centre, shrink and
# move that packing so it fits inside the department circle. Centres AND radii
# are scaled by the same factor, otherwise the circles would overlap or spill
# out of their parent.
team_layouts <- lapply(seq_len(nrow(dept_agg)), function(i) {
  team_data <- hierarchy_data[
    hierarchy_data$department == dept_agg$department[i], , drop = FALSE
  ]
  if (nrow(team_data) == 0) {
    return(NULL)
  }

  layout <- circleProgressiveLayout(team_data$headcount, sizetype = "area")

  # Re-centre the packing on the middle of its bounding box
  centre_x <- (min(layout$x - layout$radius) +
                 max(layout$x + layout$radius)) / 2
  centre_y <- (min(layout$y - layout$radius) +
                 max(layout$y + layout$radius)) / 2
  layout$x <- layout$x - centre_x
  layout$y <- layout$y - centre_y

  # Distance from the centre to the farthest circle EDGE (centre distance
  # plus radius), i.e. the radius the packing currently needs
  extent <- max(sqrt(layout$x^2 + layout$y^2) + layout$radius)
  scale_factor <- inner_fill * dept_agg$radius[i] / extent

  layout$x <- layout$x * scale_factor + dept_agg$x[i]
  layout$y <- layout$y * scale_factor + dept_agg$y[i]
  layout$radius <- layout$radius * scale_factor

  layout$department <- dept_agg$department[i]
  layout$label <- paste(team_data$team, "\n", team_data$headcount, "人")
  layout
})
team_layouts <- Filter(Negate(is.null), team_layouts)

# Polygon outlines of all team circles. circleLayoutVertices() numbers the
# circles 1..n within each department, so a department-qualified id is added
# to keep polygons of different departments apart when grouping.
all_team_vertices <- do.call(rbind, lapply(team_layouts, function(layout) {
  vertices <- circleLayoutVertices(layout[, c("x", "y", "radius")],
                                   npoints = 50)
  vertices$department <- layout$department[1]
  vertices$circle_id <- paste(vertices$department, vertices$id, sep = "_")
  vertices
}))

# Label position and text for every team
team_labels <- do.call(rbind, lapply(team_layouts, function(layout) {
  layout[, c("x", "y", "label", "department")]
}))

# Draw the nested circle packing
ggplot() +
  # Layer 1: department circles as a light background
  geom_circle(data = dept_agg,
              aes(x0 = x, y0 = y, r = radius, fill = department),
              alpha = 0.3, colour = "gray40", size = 1.2) +
  # Layer 2: team circles
  geom_polygon(data = all_team_vertices,
               aes(x = x, y = y, group = circle_id, fill = department),
               colour = "white", alpha = 0.9, size = 0.5) +
  # Team labels
  geom_text(data = team_labels,
            aes(x = x, y = y, label = label),
            size = 3, fontface = "bold") +
  # Department labels, just above each department circle
  geom_text(data = dept_agg,
            aes(x = x, y = y + radius + 0.5, label = department),
            size = 5, fontface = "bold") +
  scale_fill_brewer(palette = "Set2", name = "部门") +
  theme_void() +
  coord_equal() +
  labs(title = "圆堆积图(多层):公司组织架构",
       subtitle = "外圆 = 部门(面积=总人数),内圆 = 团队(面积=团队人数)")
本章小结
| 图表类型 | 主要用途 | 关键函数/包 | 适用场景 |
|---|---|---|---|
| 旭日图 | 多层比例展示 | plotly::plot_ly(type = 'sunburst') | 销售构成、市场份额层级 |
| 树状图 | 聚类关系/决策路径 | hclust + plot、treeDiag | 聚类分析、决策树 |
| 桑基图 | 数据流动/转化 | networkD3::sankeyNetwork | 用户转化、资金流向 |
| 矩形树状图 | 嵌套矩形占比 | treemap::treemap | 多级分类占比 |
| 圆堆积图 | 嵌套圆层次 | packcircles + ggplot2 | 组织结构、分类比较 |
选择建议
- 旭日图:适合展示比例层级,尤其当需要强调内到外的层次关系时
- 树状图:聚类分析的标准选择;决策树用于解释模型逻辑
- 桑基图:最适合展示流程、转化路径或资源分配
- 矩形树状图:空间利用率高,适合大量分类的占比展示
- 圆堆积图:视觉吸引力强,适合展示组织结构或嵌套分组
所有代码均可直接复制到R环境中运行(需联网安装缺失包)。建议根据实际数据调整参数,观察图形变化以获得最佳可视化效果。