Superset 大数据可视化 --- 完整知识点与案例代码
一、Superset 基础概念与架构
1.1 Superset 核心架构
Superset 采用 Flask + React 架构:后端通过 SQLAlchemy 连接多种数据库,前端使用 React 渲染可视化图表(新版图表主要基于 Apache ECharts,部分旧图表基于 D3.js)。
yaml
# Superset 架构组成(YAML 描述)
# ============================================================
# Web Server: 处理 HTTP 请求(Gunicorn / Nginx)
# Metadata Database: 存储仪表板、图表、用户权限等元数据(PostgreSQL / MySQL)
# Data Warehouse: 存储实际业务数据(ClickHouse / Presto / Hive 等)
# Cache Layer: Redis 缓存查询结果,提升响应速度
# Celery Worker: 异步任务执行(报表推送、SQL Lab 异步查询等)
# Message Broker: RabbitMQ / Redis 作为 Celery 的消息中间件
# ============================================================
# 推荐的生产环境架构拓扑:
#
# 用户浏览器
# │
# ▼
# Nginx (反向代理 + 静态资源)
# │
# ▼
# Gunicorn (多个 Superset Worker)
# │
# ├──► PostgreSQL (元数据存储)
# ├──► Redis (缓存 + Celery Broker)
# ├──► Celery Worker (异步任务)
# └──► 数据仓库集群 (ClickHouse / Hive / Presto 等)
二、安装与部署
2.1 Docker Compose 安装(最常用方式)
bash
# ============================================================
# Step 1: Clone the official Superset repository
# ============================================================
# --depth 1 fetches only the latest commit, which speeds up the download.
git clone --depth 1 https://github.com/apache/superset.git
# Enter the project root.
cd superset
# ============================================================
# Step 2: Start all services with Docker Compose
# ============================================================
# -d       run in the background (detached mode)
# --build  rebuild the images before starting
# Starts the containers defined in docker-compose-non-dev.yml
# (superset, superset-init, superset-worker, redis, postgres, ...).
docker-compose -f docker-compose-non-dev.yml up -d --build
# ============================================================
# Step 3: Migrate the metadata database
# ============================================================
# The metadata schema has to exist before a user can be created, so the
# migration runs first. Every command below is executed inside the running
# superset_app container through `docker exec`; no interactive shell is
# opened, so the snippet works both pasted into a terminal and in a script.
docker exec superset_app superset db upgrade
# ============================================================
# Step 4: Create the administrator account (first start only)
# ============================================================
# --username:  login name
# --firstname: given name
# --lastname:  family name
# --email:     administrator e-mail address
# --password:  login password (use a strong password in production)
docker exec superset_app superset fab create-admin \
  --username admin \
  --firstname Admin \
  --lastname User \
  --email admin@superset.com \
  --password admin123
# ============================================================
# Step 5: Initialise default roles and permissions
# ============================================================
# `superset init` creates/synchronises the built-in roles (Admin, Alpha,
# sql_lab, ...) and their permissions. It does not load example data;
# that is a separate command (`superset load_examples`).
docker exec superset_app superset init
# ============================================================
# Step 6: Verify
# ============================================================
# Open http://localhost:8088 to reach the Superset login page.
2.2 PyPI 安装(开发环境)
bash
# ============================================================
# Step 1: Create a Python virtual environment
# ============================================================
# Superset 3.x requires Python 3.9+.
# `-m venv` uses Python's built-in venv module to create an isolated env.
python3 -m venv superset_env
# Activate the virtual environment.
# macOS/Linux:
source superset_env/bin/activate
# Windows:
# superset_env\Scripts\activate
# ============================================================
# Step 2: Install Superset and its dependencies
# ============================================================
# Upgrade the packaging tools first.
pip install --upgrade pip setuptools wheel
# Install Apache Superset itself. This plain install ships no database
# drivers; they are installed individually below.
pip install apache-superset
# Drivers for the databases you want to connect to.
# Requirement specifiers with [extras] are quoted so that shells such as
# zsh do not try to expand the square brackets as a glob pattern.
pip install clickhouse-connect   # ClickHouse
pip install psycopg2-binary      # PostgreSQL
pip install mysqlclient          # MySQL
pip install "pyhive[hive]"       # Hive
pip install "pyhive[presto]"     # Presto
# ============================================================
# Step 3: Configure SECRET_KEY
# ============================================================
# Superset needs a secret key to sign session cookies and encrypt stored
# credentials. Generate one and export it for this shell session so the
# commands in step 4 can start.
# IMPORTANT: generate the key once and persist it (for example in
# superset_config.py or your secret store); a different key on the next
# start makes previously encrypted data unreadable.
SUPERSET_SECRET_KEY="$(python -c "import secrets; print(secrets.token_hex(32))")"
export SUPERSET_SECRET_KEY
# ============================================================
# Step 4: Initialise the database
# ============================================================
# Entry module of the Flask application.
export FLASK_APP=superset
# Apply the metadata database migrations.
superset db upgrade
# Create the administrator account.
superset fab create-admin \
  --username admin \
  --firstname Admin \
  --lastname User \
  --email admin@example.com \
  --password admin123
# Create the default roles and permissions.
superset init
# ============================================================
# Step 5: Start the development server
# ============================================================
# -p 8088:        port to listen on
# -h 0.0.0.0:     listen on all interfaces (allows external access)
# --with-threads: handle requests with multiple threads
# --reload:       restart automatically when code changes (dev mode)
superset run -p 8088 -h 0.0.0.0 --with-threads --reload
2.3 Kubernetes Helm 安装
yaml
# ============================================================
# superset-k8s-values.yaml
# Custom values file for the Superset Helm chart
# ============================================================
# --- Superset application settings ---
# Init job: runs the database migration and initialisation before the
# Superset pods start serving.
init:
  # Enable the init job
  enabled: true
  # Commands performed during initialisation:
  #   db upgrade:    migrate the metadata schema
  #   init:          create roles and permissions
  #   load-examples: load the sample data (keep disabled in production)
  adminUser:
    username: admin          # administrator login name
    firstname: Admin         # given name
    lastname: User           # family name
    email: admin@superset.cn
    password: StrongP@ss123  # manage via Secrets in production
# --- Metadata database connection ---
# An external PostgreSQL is recommended in production.
# To use the bundled PostgreSQL instead, set postgresql.enabled = true.
supersetNode:
  connections:
    # The chart assembles the SQLAlchemy URI from these parts:
    # postgresql+psycopg2://user:password@host:port/database
    # The chart reads snake_case keys (db_host, db_port, ...).
    db_host: "external-postgresql.default.svc.cluster.local"
    db_port: "5432"
    db_name: "superset"
    db_user: "superset"
    db_pass: "superset_password"
# --- Redis (cache + Celery broker) ---
redis:
  enabled: true
  # Redis password (recommended in production).
  # NOTE(review): confirm the key path against the bundled Redis sub-chart
  # version; some versions expect it under `auth.password`.
  password: "redis_password"
# --- Celery (asynchronous tasks) ---
# NOTE(review): verify this section against the chart's values.yaml; the
# official chart configures Celery workers under `supersetWorker`.
celery:
  # Enable the Celery worker
  enabled: true
  # Worker concurrency
  concurrency: 4
# --- Ingress (external entry point) ---
ingress:
  enabled: true
  # Host names served by the ingress (the chart expects plain strings)
  hosts:
    - superset.example.com
  # TLS certificate
  tls:
    - secretName: superset-tls
      hosts:
        - superset.example.com
# --- Replica counts ---
# Number of Superset web replicas.
# NOTE(review): depending on the chart version this lives under
# `supersetNode` / `supersetWorker`; confirm before relying on it.
replicaCount: 2
# Number of Celery worker replicas
worker:
  replicaCount: 2
bash
# ============================================================
# Install Superset on Kubernetes with Helm
# ============================================================
# 1. Register the Superset chart repository, then refresh the local index.
helm repo add superset https://apache.github.io/superset
helm repo update
# 2. Create the target namespace.
kubectl create namespace superset
# 3. Install the chart.
#    superset-chart   release name
#    --namespace      namespace to install into
#    --values         custom values file
#    --timeout 10m    wait at most 10 minutes
helm install superset-chart superset/superset \
  --namespace superset \
  --values superset-k8s-values.yaml \
  --timeout 10m
# 4. Check the pod status.
kubectl get pods --namespace superset
# 5. Forward a local port for development/debugging access.
kubectl port-forward --namespace superset svc/superset-chart 8088:8088
三、核心配置详解
3.1 superset_config.py 主配置文件
python
# ============================================================
# superset_config.py
# Superset 的核心配置文件,放在 Python 可以 import 的路径下
# 环境变量 SUPERSET_CONFIG_PATH 可指定自定义路径
# ============================================================
import os
from datetime import timedelta
from celery.schedules import crontab
# ============================================================
# 1. Security
# ============================================================
# SECRET_KEY signs session cookies and encrypts sensitive metadata such as
# stored database passwords. It must be set in production; generate one
# with `python -c "import secrets; print(secrets.token_hex(32))"`.
# WARNING: the fallback below is a placeholder for local experiments only.
SECRET_KEY = os.environ.get("SUPERSET_SECRET_KEY", "your-secret-key-change-in-production-abc123")
# Lifetime of a CSRF token in seconds (3600 = 1 hour).
WTF_CSRF_TIME_LIMIT = 3600
# Cross-site request forgery protection (must stay True in production).
WTF_CSRF_ENABLED = True
# ============================================================
# 2. Metadata database
# ============================================================
# SQLAlchemy connection URI.
# Format: dialect+driver://username:password@host:port/database
# PostgreSQL is recommended in production.
SQLALCHEMY_DATABASE_URI = os.environ.get(
    "METADATA_DB_URI",
    "postgresql+psycopg2://superset:superset_password@localhost:5432/superset",
)
# Connection pool size (number of pooled connections).
SQLALCHEMY_POOL_SIZE = 10
# Extra connections allowed beyond the pool size.
SQLALCHEMY_MAX_OVERFLOW = 20
# Seconds to wait for a connection from the pool.
SQLALCHEMY_POOL_TIMEOUT = 30
# Recycle connections older than this many seconds so the database server
# does not drop them first.
SQLALCHEMY_POOL_RECYCLE = 3600
# ============================================================
# 3. Cache configuration (Redis)
# ============================================================
# Redis connection settings shared by every cache below, so that all of
# them honour the same REDIS_* environment variables. Lower-case names are
# not picked up as Superset settings.
_redis_host = os.environ.get("REDIS_HOST", "localhost")
_redis_port = int(os.environ.get("REDIS_PORT", 6379))
_redis_password = os.environ.get("REDIS_PASSWORD", "")
# General-purpose cache.
CACHE_CONFIG = {
    # Flask-Caching Redis backend.
    "CACHE_TYPE": "RedisCache",
    "CACHE_DEFAULT_TIMEOUT": 300,      # default expiry: 300 s (5 minutes)
    "CACHE_KEY_PREFIX": "superset_",   # key prefix to avoid clashes with other apps
    # Redis connection
    "CACHE_REDIS_HOST": _redis_host,
    "CACHE_REDIS_PORT": _redis_port,
    "CACHE_REDIS_DB": 1,               # Redis database number (0-15)
    "CACHE_REDIS_PASSWORD": _redis_password,
}
# Data cache (stores chart query results).
DATA_CACHE_CONFIG = {
    "CACHE_TYPE": "RedisCache",
    "CACHE_DEFAULT_TIMEOUT": 600,      # cache chart data for 10 minutes
    "CACHE_KEY_PREFIX": "superset_data_",
    "CACHE_REDIS_HOST": _redis_host,
    "CACHE_REDIS_PORT": _redis_port,
    "CACHE_REDIS_DB": 2,               # separate Redis DB for isolation
    "CACHE_REDIS_PASSWORD": _redis_password,
}
# ============================================================
# 4. Celery (asynchronous tasks)
# ============================================================
# Redis acts as the Celery message broker.
class CeleryConfig:
    """
    Celery settings.

    Superset uses Celery for asynchronous work such as:
    - asynchronous SQL Lab queries
    - scheduled alerts and reports
    - cache warm-up
    """
    # Broker URL: where workers fetch task messages from.
    broker_url = os.environ.get(
        "CELERY_BROKER_URL",
        "redis://localhost:6379/0",
    )
    # Result backend: where task results are stored.
    result_backend = os.environ.get(
        "CELERY_RESULT_BACKEND",
        "redis://localhost:6379/0",
    )
    # Modules whose tasks the worker must register.
    imports = (
        "superset.sql_lab",
        "superset.tasks.scheduler",
    )
    # Serialisation: JSON is easy to debug.
    task_serializer = "json"
    result_serializer = "json"
    # Accepted content types.
    accept_content = ["json"]
    # Time zone used to interpret crontab schedules.
    timezone = "Asia/Shanghai"
    # Keep internal timestamps in UTC.
    enable_utc = True
    # Worker concurrency.
    worker_concurrency = 4
    # Tasks prefetched per worker process (1 = fair scheduling).
    worker_prefetch_multiplier = 1
    # Periodic tasks (Celery Beat). The task names must match tasks that
    # Superset actually registers; unknown names are rejected by the worker.
    beat_schedule = {
        # Warm the dashboard cache every 30 minutes.
        # NOTE(review): cache warm-up may need additional settings depending
        # on the Superset version; confirm against the caching docs.
        "cache-warmup": {
            "task": "cache-warmup",
            "schedule": timedelta(minutes=30),
            "kwargs": {
                "strategy_name": "top_n_dashboards",
                "top_n": 10,
                "since": "7 days ago",
            },
        },
        # Alerts & Reports: the scheduler runs every minute and dispatches
        # each report according to the cron expression set on it in the UI
        # (for example "0 8 * * *" for a daily 08:00 report).
        "reports.scheduler": {
            "task": "reports.scheduler",
            "schedule": crontab(minute="*", hour="*"),
        },
        # Prune old report execution logs once a day at midnight.
        "reports.prune_log": {
            "task": "reports.prune_log",
            "schedule": crontab(minute=0, hour=0),
        },
    }
# Hand the class to Superset.
CELERY_CONFIG = CeleryConfig
# ============================================================
# 5. Feature flags
# ============================================================
# FEATURE_FLAGS switches individual Superset features on or off.
FEATURE_FLAGS = {
    # Native dashboard filter components
    "DASHBOARD_NATIVE_FILTERS": True,
    # Cross-filtering between charts
    "DASHBOARD_CROSS_FILTERS": True,
    # Jinja template processing in SQL Lab / virtual datasets
    "ENABLE_TEMPLATE_PROCESSING": True,
    # Alerts and scheduled reports
    "ALERT_REPORTS": True,
    # Embedded dashboards (iframe embedding in third-party systems)
    "EMBEDDED_SUPERSET": True,
    # Column-level permissions on datasets
    # NOTE(review): confirm that this flag exists in the Superset version in use.
    "COLUMN_LEVEL_SECURITY": True,
    # SSH tunnelling for databases in private networks
    "SSH_TUNNELING": True,
}
# ============================================================
# 6. SQL Lab
# ============================================================
# Maximum run time of a SQL Lab query, in seconds.
SQLLAB_TIMEOUT = 300
# Do not apply the row limit to CREATE TABLE AS (CTAS) queries.
SQLLAB_CTAS_NO_LIMIT = True
# Maximum number of rows returned for a chart query.
ROW_LIMIT = 50000
# Default row limit applied to SQL Lab queries.
DEFAULT_SQLLAB_LIMIT = 1000
# ============================================================
# 7. E-mail (used by reports and alerts)
# ============================================================
# SMTP server
SMTP_HOST = os.environ.get("SMTP_HOST", "smtp.example.com")
SMTP_STARTTLS = True   # upgrade the connection with STARTTLS
SMTP_SSL = False       # no implicit SSL (STARTTLS already encrypts)
SMTP_PORT = int(os.environ.get("SMTP_PORT", 587))
SMTP_MAIL_FROM = os.environ.get("SMTP_MAIL_FROM", "superset@example.com")
# NOTE(review): SMTP_MAIL_SERVER does not appear to be a setting Superset
# reads; kept for compatibility with anything that references it.
SMTP_MAIL_SERVER = SMTP_HOST
# SMTP credentials (if the server requires authentication)
SMTP_USER = os.environ.get("SMTP_USER", "")
SMTP_PASSWORD = os.environ.get("SMTP_PASSWORD", "")
# ============================================================
# 8. Web security
# ============================================================
# Enable Talisman, which adds security headers (including the CSP below).
TALISMAN_ENABLED = True
# Content Security Policy (CSP), a defence against XSS.
TALISMAN_CONFIG = {
    "content_security_policy": {
        # default-src: fallback policy for all resource types
        "default-src": ["'self'"],
        # script-src: allowed script origins
        "script-src": ["'self'", "'unsafe-inline'"],
        # style-src: allowed stylesheet origins
        "style-src": ["'self'", "'unsafe-inline'"],
        # img-src: allowed image origins (data: covers base64 images)
        "img-src": ["'self'", "data:", "https://*.tile.openstreetmap.org"],
    },
    # Force HTTPS (set to True in production)
    "force_https": False,
}
# ============================================================
# 9. Logging
# ============================================================
# Event logging records user actions (viewing dashboards, running queries).
# EVENT_LOGGER must be an event-logger *instance*, not a dotted-path string
# (a string makes Superset fail at start-up). The database-backed logger is
# already the default, so nothing needs to be assigned here. To customise:
#   from superset.utils.log import DBEventLogger
#   EVENT_LOGGER = DBEventLogger()
# Log SQL queries (debugging aid; mind sensitive data in production).
# NOTE(review): confirm that the Superset version in use reads this setting.
LOG_SQL_QUERIES = True
# ============================================================
# 10. Export
# ============================================================
# Export formats offered to users.
# NOTE(review): confirm that the Superset version in use reads this setting.
EXPORT_FORMATS = ["csv", "json", "xlsx"]
CSV_EXPORT = {
    "encoding": "utf-8-sig",  # UTF-8 with BOM so Excel opens Chinese text correctly
}
四、连接数据库
4.1 通过 UI 连接数据库
bash
# ============================================================
# 通过 Superset Web 界面连接数据库的操作路径
# ============================================================
# 1. 点击顶部导航栏 "Data" → "Databases"
# 2. 点击右上角 "+ Database" 按钮
# 3. 在弹出窗口中选择数据库类型或使用 SQLAlchemy URI 自定义连接
# 4. 填写连接信息后点击 "Test Connection" 验证连通性
# 5. 测试通过后点击 "Connect" 保存
4.2 通过 Python API 编程式连接数据库
python
# ============================================================
# connect_database.py
# 通过 Superset REST API 编程式添加数据库连接
# ============================================================
import requests
import json
# Base URL of the Superset server.
SUPERSET_URL = "http://localhost:8088"
# Seconds to wait for a single HTTP call before giving up.
REQUEST_TIMEOUT = 30
# A Session keeps the cookie that Superset issues together with the CSRF
# token; without that cookie, write requests are rejected by CSRF checks.
session = requests.Session()
# ============================================================
# Step 1: Log in and obtain an access token
# ============================================================
# Headers for the login request.
login_headers = {
    "Content-Type": "application/json",  # request body is JSON
    "Accept": "application/json",        # expect a JSON response
}
# Login request body.
login_payload = {
    "username": "admin",     # administrator user name
    "password": "admin123",  # administrator password
    "provider": "db",        # authentication provider: database auth
    "refresh": True,         # also return a refresh token
}
# POST to the login endpoint and stop early on a failed login.
response = session.post(
    f"{SUPERSET_URL}/api/v1/security/login",
    headers=login_headers,
    json=login_payload,
    timeout=REQUEST_TIMEOUT,
)
response.raise_for_status()
# The access token (a JWT) must accompany every subsequent API request.
access_token = response.json()["access_token"]
print(f"登录成功,获取到 Token: {access_token[:20]}...")  # first 20 characters only
# ============================================================
# Step 2: Build the common request headers
# ============================================================
# Fetch the CSRF token through the same session so its cookie is retained.
csrf_response = session.get(
    f"{SUPERSET_URL}/api/v1/security/csrf_token/",
    headers={"Authorization": f"Bearer {access_token}"},
    timeout=REQUEST_TIMEOUT,
)
csrf_response.raise_for_status()
# Headers used by every API request from here on.
api_headers = {
    "Content-Type": "application/json",
    "Accept": "application/json",
    "Authorization": f"Bearer {access_token}",   # authenticates the caller
    "X-CSRFToken": csrf_response.json()["result"],
    "Referer": SUPERSET_URL,                     # required by CSRF protection
}
# ============================================================
# Step 3: Add a ClickHouse database connection
# ============================================================
clickhouse_db_payload = {
    "database_name": "clickhouse_prod",      # name shown in Superset
    "sqlalchemy_uri": (                      # SQLAlchemy connection URI
        # `clickhousedb://` is the dialect registered by the
        # clickhouse-connect driver installed earlier in this guide.
        "clickhousedb://"
        "default:"                           # user name
        "clickhouse_password@"               # password
        "clickhouse-cluster.example.com:"    # host
        "8123/"                              # HTTP port
        "analytics_db"                       # database name
    ),
    "expose_in_sqllab": True,   # show in SQL Lab
    "allow_ctas": True,         # allow CREATE TABLE AS
    "allow_cvas": True,         # allow CREATE VIEW AS
    "allow_dml": True,          # allow INSERT/UPDATE/DELETE
    "allow_run_async": True,    # allow asynchronous query execution
    "extra": json.dumps({
        # ClickHouse-specific parameters
        "engine_params": {
            "connect_args": {
                "send_receive_timeout": 300,  # send/receive timeout: 300 s
            }
        },
        # Metadata cache settings
        "metadata_cache_timeout": {
            "schema_cache_timeout": 600,  # cache schemas for 10 minutes
            "table_cache_timeout": 600,   # cache table names for 10 minutes
        },
    }),
}
# POST the new database definition.
response = session.post(
    f"{SUPERSET_URL}/api/v1/database/",
    headers=api_headers,
    json=clickhouse_db_payload,
    timeout=REQUEST_TIMEOUT,
)
if response.status_code == 201:
    db_id = response.json()["id"]
    print(f"ClickHouse 数据库连接创建成功!数据库 ID: {db_id}")
else:
    print(f"创建失败: {response.status_code} - {response.text}")
# ============================================================
# Step 4: Add a Hive database connection
# ============================================================
hive_db_payload = {
    "database_name": "hive_warehouse",
    "sqlalchemy_uri": (
        "hive://"
        "hive_user:"                 # user name
        "hive_pass@"                 # password
        "hive-server.example.com:"   # HiveServer2 host
        "10000/"                     # HiveServer2 port
        "default"                    # default database
        "?auth=CUSTOM"               # authentication mode
    ),
    "expose_in_sqllab": True,
    "allow_ctas": True,
    "allow_cvas": True,
    "allow_dml": False,        # DML is usually not used with Hive
    "allow_run_async": True,   # Hive queries are slow; run them asynchronously
    "extra": json.dumps({
        "metadata_params": {},
        "engine_params": {
            "connect_args": {
                "configuration": {
                    "hive.server2.thrift.resultset.default.fetch.size": 10000,
                }
            }
        },
    }),
}
response = session.post(
    f"{SUPERSET_URL}/api/v1/database/",
    headers=api_headers,
    json=hive_db_payload,
    timeout=REQUEST_TIMEOUT,
)
print(f"Hive 连接创建结果: {response.status_code}")
# ============================================================
# Step 5: List all registered databases
# ============================================================
response = session.get(
    f"{SUPERSET_URL}/api/v1/database/",
    headers=api_headers,
    timeout=REQUEST_TIMEOUT,
)
response.raise_for_status()
# Print the ID and name of every database returned.
for db in response.json()["result"]:
    print(f" 数据库 ID: {db['id']}, 名称: {db['database_name']}")
五、数据集(Dataset)注册与管理
5.1 注册新表为数据集
python
# ============================================================
# register_dataset.py
# 通过 API 注册数据库中的表为 Superset 数据集
# ============================================================
import requests
import json
# 假设已获取 access_token 和 api_headers(参考上文)
SUPERSET_URL = "http://localhost:8088"
# ... headers 设置同上 ...
# ============================================================
# Step 1: Register a physical table as a dataset
# ============================================================
# Parameters:
#   database:   ID of the database as registered in Superset
#   table_name: table name
#   schema:     schema name (not every database has schemas)
dataset_payload = {
    "database": 1,           # database ID
    "table_name": "orders",  # table name
    "schema": "public",      # schema name
}
# POST to create the dataset.
response = requests.post(
    f"{SUPERSET_URL}/api/v1/dataset/",
    headers=api_headers,
    data=json.dumps(dataset_payload),
)
# Every later step needs dataset_id, so stop here if creation failed
# instead of hitting a NameError further down.
if response.status_code != 201:
    raise SystemExit(f"数据集注册失败: {response.status_code} - {response.text}")
dataset_id = response.json()["id"]
print(f"数据集注册成功!数据集 ID: {dataset_id}")
# ============================================================
# Step 2: Customise column properties
# ============================================================
# Update the dataset's column configuration: display names, whether a
# column can be filtered or grouped on, and so on.
# NOTE(review): depending on the Superset version, updating columns that
# already exist may require their "id" values (or override_columns);
# verify against the dataset API before relying on this payload.
column_update_payload = {
    "columns": [
        {
            "column_name": "order_id",
            "verbose_name": "订单ID",        # display name
            "is_active": True,               # column is active in the dataset
            "filterable": True,              # usable as a filter
            "groupby": True,                 # usable as a group-by dimension
            "description": "唯一订单标识",   # column description
        },
        {
            "column_name": "order_amount",
            "verbose_name": "订单金额",
            "is_active": True,
            "filterable": True,
            "groupby": False,                # amounts are normally not grouped on
            "type": "FLOAT",                 # data type
            "description": "订单总金额(单位:元)",
        },
        {
            "column_name": "order_date",
            "verbose_name": "订单日期",
            "is_active": True,
            "filterable": True,
            "groupby": True,
            "python_date_format": "%Y-%m-%d",  # Python date parsing format
            "description": "订单创建日期",
        },
        {
            "column_name": "customer_region",
            "verbose_name": "客户区域",
            "is_active": True,
            "filterable": True,
            "groupby": True,
            "description": "客户所属地理区域",
        },
    ],
}
# PUT to update the dataset's columns.
response = requests.put(
    f"{SUPERSET_URL}/api/v1/dataset/{dataset_id}",
    headers=api_headers,
    data=json.dumps(column_update_payload),
)
print(f"列属性更新结果: {response.status_code}")
# ============================================================
# Step 3: Add calculated metrics
# ============================================================
metrics_payload = {
    "metrics": [
        {
            # Metric identifier
            "metric_name": "total_amount",
            # Friendly name
            "verbose_name": "总金额",
            # SQL expression: SUM aggregate
            "expression": "SUM(order_amount)",
            # Metric description
            "description": "所有订单的总金额",
        },
        {
            "metric_name": "avg_amount",
            "verbose_name": "平均订单金额",
            # AVG aggregate
            "expression": "AVG(order_amount)",
            "description": "订单金额的平均值",
        },
        {
            "metric_name": "order_count",
            "verbose_name": "订单数量",
            # COUNT expression
            "expression": "COUNT(order_id)",
            "description": "订单总数",
        },
        {
            "metric_name": "distinct_customers",
            "verbose_name": "客户数(去重)",
            # COUNT DISTINCT expression
            "expression": "COUNT(DISTINCT customer_id)",
            "description": "去重后的客户数量",
        },
        {
            "metric_name": "amount_ratio",
            "verbose_name": "金额占比",
            # Composite expression: a sub-query supplies the grand total
            "expression": (
                "SUM(order_amount) / "
                "(SELECT SUM(order_amount) FROM orders)"
            ),
            "description": "当前分组金额占总金额的比例",
        },
    ],
}
response = requests.put(
    f"{SUPERSET_URL}/api/v1/dataset/{dataset_id}",
    headers=api_headers,
    data=json.dumps(metrics_payload),
)
print(f"指标更新结果: {response.status_code}")
六、Superset 语义层(Semantic Layer)
6.1 语义层概念与配置
python
# ============================================================
# superset_semantic_layer.py
# 语义层是 Superset 的核心抽象层,用于定义:
# - 虚拟指标(Virtual Metrics)
# - 虚拟列(Virtual Calculated Columns)
# - 计量表达式(Metric Expressions)
# ============================================================
# ============================================================
# 1. Virtual metrics
# ============================================================
# A virtual metric is an aggregate defined by a SQL expression. It can be
# used in charts exactly like an ordinary metric.
# Example: commonly used virtual metrics.
virtual_metrics = [
    {
        "metric_name": "revenue_per_order",
        "verbose_name": "客单价",
        # Total amount divided by the number of orders; both sides are
        # aggregates so the ratio is computed per group.
        "expression": "SUM(order_amount) / COUNT(DISTINCT order_id)",
        "description": "平均每个订单的金额",
    },
    {
        "metric_name": "week_over_week_growth",
        "verbose_name": "周销售额(环比基数)",
        # A metric expression cannot look at the previous week's bucket, so
        # this metric is only the weekly total. The week-over-week change
        # itself is derived from it with the chart's time-comparison option
        # (or with LAG() in a SQL Lab query).
        "expression": "SUM(order_amount)",
        "description": "每周订单金额合计;周环比增长率需在图表的时间对比(Time Comparison)中基于该指标计算",
    },
    {
        "metric_name": "profit_margin",
        "verbose_name": "利润率",
        # Margin = (revenue - cost) / revenue. NULLIF yields NULL instead of
        # a division-by-zero error for groups without revenue.
        "expression": "(SUM(revenue) - SUM(cost)) / NULLIF(SUM(revenue), 0) * 100",
        "description": "利润占收入的百分比",
    },
    {
        "metric_name": "high_value_orders_ratio",
        "verbose_name": "大额订单占比",
        # Conditional aggregation with CASE WHEN:
        # orders above 1000 divided by all orders.
        "expression": (
            "SUM(CASE WHEN order_amount > 1000 THEN 1 ELSE 0 END) * 100.0 "
            "/ COUNT(order_id)"
        ),
        "description": "金额超过1000元的订单占比(百分比)",
    },
]
# ============================================================
# 2. Virtual calculated columns
# ============================================================
# A calculated column is a new column defined by a SQL expression. It is
# evaluated per row (not aggregated), i.e. before GROUP BY is applied.
virtual_columns = [
    {
        "column_name": "order_size_category",
        "verbose_name": "订单大小分类",
        # CASE WHEN buckets the amount into three categories.
        "expression": (
            "CASE "
            " WHEN order_amount < 100 THEN '小额订单' "
            " WHEN order_amount BETWEEN 100 AND 1000 THEN '中等订单' "
            " WHEN order_amount > 1000 THEN '大额订单' "
            " ELSE '未分类' "
            "END"
        ),
        "type": "VARCHAR",
        "description": "根据订单金额大小自动分类",
    },
    {
        "column_name": "is_weekend",
        "verbose_name": "是否周末",
        # Date function decides whether the day is a weekend.
        # DAYOFWEEK: 1 = Sunday, 7 = Saturday (MySQL syntax).
        "expression": (
            "CASE "
            " WHEN DAYOFWEEK(order_date) IN (1, 7) THEN '周末' "
            " ELSE '工作日' "
            "END"
        ),
        "type": "VARCHAR",
        "description": "根据订单日期判断是否为周末",
    },
    {
        "column_name": "profit",
        "verbose_name": "利润",
        # Plain arithmetic column
        "expression": "revenue - cost",
        "type": "FLOAT",
        "description": "每行记录的利润(收入 - 成本)",
    },
    {
        "column_name": "full_address",
        "verbose_name": "完整地址",
        # String concatenation
        "expression": "CONCAT(province, city, district, street)",
        "type": "VARCHAR",
        "description": "拼接省市区街道为完整地址",
    },
]
七、SQL Lab 与 SQL 模板
7.1 SQL Lab 基本查询
sql
-- ============================================================
-- SQL Lab: Superset's built-in SQL editor
-- Found in the top navigation bar under "SQL" -> "SQL Lab"
-- ============================================================
-- Basic query: the 100 most recent orders placed since 2024-01-01.
-- Columns: order ID, customer ID, order amount, order date, order status.
SELECT order_id
     , customer_id
     , order_amount
     , order_date
     , status
  FROM orders
 WHERE order_date >= '2024-01-01'   -- orders from 2024 onwards only
 ORDER BY order_date DESC           -- newest first
 LIMIT 100;                         -- return at most 100 rows
7.2 Jinja2 SQL 模板(高级功能)
sql
-- ============================================================
-- Jinja2 SQL templates
-- Jinja2 template syntax makes queries dynamic.
-- Template processing must be enabled in superset_config.py:
--   FEATURE_FLAGS = {"ENABLE_TEMPLATE_PROCESSING": True}
-- ============================================================
-- -------------------------------------------------------
-- Example 1: built-in time-filter template variables
-- {{ from_dttm }} and {{ to_dttm }} are replaced with the date range
-- the user selected on the dashboard.
-- -------------------------------------------------------
SELECT
    DATE(order_date) AS order_day,           -- aggregate per day
    COUNT(*) AS order_count,                 -- orders per day
    SUM(order_amount) AS daily_revenue,      -- revenue per day
    AVG(order_amount) AS avg_order_value     -- average order value per day
FROM
    orders
WHERE
    -- Use the selected date range when there is one, otherwise the last
    -- 30 days. The test has to happen in Jinja: an unset variable would be
    -- rendered as the literal text 'None', which is not NULL, so a SQL
    -- COALESCE() around it would never fall back to the default.
    {% if from_dttm %}
    order_date >= '{{ from_dttm }}'                    -- selected start time
    {% else %}
    order_date >= CURRENT_DATE - INTERVAL '30 days'    -- default: 30 days ago
    {% endif %}
    {% if to_dttm %}
    AND order_date < '{{ to_dttm }}'                   -- selected end time
    {% else %}
    AND order_date < CURRENT_DATE                      -- default: today
    {% endif %}
GROUP BY
    DATE(order_date)
ORDER BY
    order_day;
-- -------------------------------------------------------
-- Example 2: the filter_values() macro
-- {{ filter_values('column_name') }} returns the values the user picked
-- in a dashboard filter.
-- -------------------------------------------------------
-- Assume the dashboard has a "customer region" filter and the user
-- selected ["华东", "华南"].
SELECT
    customer_region,
    SUM(order_amount) AS total_revenue,
    COUNT(DISTINCT customer_id) AS customer_count
FROM
    orders
WHERE
    -- filter_values returns a list, matched with IN.
    -- With no selection, all rows are returned.
    {% if filter_values('customer_region') %}
    customer_region IN {{ filter_values('customer_region') | where_in }}
    {% else %}
    1 = 1 -- always true: no restriction
    {% endif %}
GROUP BY
    customer_region
ORDER BY
    total_revenue DESC;
-- -------------------------------------------------------
-- Example 3: custom Jinja2 macros
-- Reusable SQL logic
-- -------------------------------------------------------
-- Custom macros are registered in superset_config.py through
-- JINJA_CONTEXT_ADDONS (do not edit superset/jinja_context.py):
--
-- def current_user_id():
--     """Return the ID of the logged-in user, or None."""
--     from flask import g
--     return g.user.id if hasattr(g, 'user') and g.user else None
--
-- def region_filter(region_column):
--     """Build a region restriction from the user's permissions."""
--     user_id = current_user_id()
--     if user_id is None:
--         return "1 = 0"  # no user: match nothing
--     return f"""
--     {region_column} IN (
--         SELECT region FROM user_region_access
--         WHERE user_id = {int(user_id)}
--     )
--     """
--
-- JINJA_CONTEXT_ADDONS = {"region_filter": region_filter}
-- Using the custom macro in SQL
SELECT
    product_category,
    SUM(order_amount) AS revenue
FROM
    orders
WHERE
    {{ region_filter('customer_region') }} -- call the custom macro
GROUP BY
    product_category;
-- -------------------------------------------------------
-- Example 4: dynamic GROUP BY column (advanced template)
-- Switch the grouping dimension with a parameter
-- -------------------------------------------------------
-- The dimension is passed as a URL parameter, e.g. ?dimension=product_category
-- url_param() is the macro Superset provides for reading URL parameters.
-- The value becomes a column name in the query, so it is checked against
-- an allow-list first; anything else falls back to customer_region.
{% set allowed_dimensions = ['customer_region', 'product_category'] %}
{% set requested_dimension = url_param('dimension') %}
{% set group_dimension = requested_dimension if requested_dimension in allowed_dimensions else 'customer_region' %}
SELECT
    {{ group_dimension }} AS dimension_value, -- dynamic column name
    COUNT(*) AS record_count,
    SUM(order_amount) AS total_amount,
    ROUND(AVG(order_amount), 2) AS avg_amount,
    MIN(order_date) AS first_order_date,
    MAX(order_date) AS last_order_date
FROM
    orders
GROUP BY
    {{ group_dimension }}
ORDER BY
    total_amount DESC
LIMIT 20;
八、创建图表(Charts)
8.1 折线图(Line Chart)
sql
-- ============================================================
-- Line chart: monthly sales trend
-- Steps in Superset:
--   1. Click "+" -> "Chart"
--   2. Pick the dataset "orders"
--   3. Choose the chart type "Line Chart"
--   4. Configure the parameters shown below
-- ============================================================
-- Validation query for SQL Lab.
--   month            X axis: order date truncated to the month
--   monthly_revenue  Y metric 1: revenue per month
--   order_count      Y metric 2: number of orders
--   avg_order_value  Y metric 3: average order value
SELECT DATE_TRUNC('month', order_date)                        AS month
     , SUM(order_amount)                                      AS monthly_revenue
     , COUNT(order_id)                                        AS order_count
     , ROUND(SUM(order_amount) / COUNT(order_id), 2)          AS avg_order_value
  FROM orders
 WHERE order_date >= '2023-01-01'
   AND order_date < '2024-01-01'
 GROUP BY DATE_TRUNC('month', order_date)
 ORDER BY month ASC;
python
# ============================================================
# Create the line chart (a "slice") through the API
# ============================================================
chart_payload = {
    "slice_name": "月度销售趋势",          # chart title
    "viz_type": "echarts_timeseries",      # chart type: ECharts time series
    "datasource_id": 1,                    # dataset ID
    "datasource_type": "table",            # datasource type: table
    "params": json.dumps({
        # --- Data ---
        "datasource": "1__table",          # datasource identifier
        # X axis: the time column
        "x_axis": "order_date",
        # X axis time grain: month
        "time_grain_sqla": "P1M",          # ISO 8601 duration: 1 month
        # Y axis metrics
        "metrics": [
            {
                "expressionType": "SIMPLE",  # simple expression
                "column": {"column_name": "order_amount"},
                "aggregate": "SUM",          # aggregate: sum
                "label": "总金额",
            },
            {
                "expressionType": "SIMPLE",
                "column": {"column_name": "order_id"},
                "aggregate": "COUNT",
                "label": "订单数",
            },
        ],
        # Group-by dimension (one line per region)
        "groupby": ["customer_region"],
        # Sort order
        "orderby": [["order_date", True]],   # by date, ascending
        # Row limit
        "row_limit": 10000,
        # --- Styling ---
        "show_legend": True,               # show the legend
        "legendType": "scroll",            # scrollable legend (many series)
        "legendOrientation": "top",        # legend at the top
        "rich_tooltip": True,              # rich tooltip
        "tooltipTimeFormat": "%Y-%m",      # time format in the tooltip
        "x_axis_time_format": "%Y-%m",     # time format on the X axis
        # Smooth curve
        # NOTE(review): confirm this key is honoured by the chosen viz_type.
        "smooth": True,
        # Show data point markers
        # NOTE(review): confirm the control name for the Superset version in use.
        "showMarker": False,
        # Area fill is an on/off option; a boolean False keeps a plain
        # line (the string "None" would be truthy and enable the fill).
        "area": False,
        # --- Advanced ---
        # Truncate the Y axis instead of starting at 0
        "truncateYAxis": False,
        # Y axis number format (amounts)
        "y_axis_format": ",.2f",           # thousands separator + 2 decimals
    }),
}
response = requests.post(
    f"{SUPERSET_URL}/api/v1/chart/",
    headers=api_headers,
    data=json.dumps(chart_payload),
)
if response.status_code == 201:
    chart_id = response.json()["id"]
    print(f"折线图创建成功!图表 ID: {chart_id}")
else:
    print(f"创建失败: {response.status_code} - {response.text}")
8.2 透视表(Pivot Table)
sql
-- ============================================================
-- Pivot table: region x product category cross analysis
-- ============================================================
-- Validation query for SQL Lab.
--   customer_region   row dimension: region
--   product_category  column dimension: product category
--   total_revenue     measure: total revenue
--   order_count       measure: number of orders
--   avg_revenue       measure: average revenue
SELECT customer_region
     , product_category
     , SUM(order_amount)            AS total_revenue
     , COUNT(order_id)              AS order_count
     , ROUND(AVG(order_amount), 2)  AS avg_revenue
  FROM orders o
 INNER JOIN products p
    ON p.product_id = o.product_id
 WHERE order_date >= '2024-01-01'
 GROUP BY customer_region, product_category
 ORDER BY customer_region ASC, total_revenue DESC;
python
# ============================================================
# Create the pivot table through the API
# ============================================================
# Measure: SUM(order_amount), labelled as total revenue.
_pivot_revenue_metric = {
    "expressionType": "SIMPLE",
    "column": {"column_name": "order_amount"},
    "aggregate": "SUM",
    "label": "总收入",
}
# Conditional formatting rules applied to the total-revenue measure.
_pivot_formatting_rules = [
    # above 100,000 -> green
    {"column": "总收入", "operator": ">", "value": 100000, "colorScheme": "green"},
    # below 10,000 -> red
    {"column": "总收入", "operator": "<", "value": 10000, "colorScheme": "red"},
]
# Chart parameters; serialised to a JSON string for the "params" field.
_pivot_params = {
    "datasource": "1__table",
    "groupbyRows": ["customer_region"],        # pivot rows: customer region
    "groupbyColumns": ["product_category"],    # pivot columns: product category
    "metrics": [_pivot_revenue_metric],
    "rowTotals": True,                         # show a total per row
    "colTotals": True,                         # show a total per column
    "valueFormat": ",.0f",                     # integers with thousands separator
    "conditional_formatting": _pivot_formatting_rules,
    "time_range": "2024-01-01 : 2024-12-31",   # time range
}
pivot_chart_payload = {
    "slice_name": "区域-产品类别交叉分析",
    "viz_type": "pivot_table_v2",  # chart type: pivot table v2
    "datasource_id": 1,
    "datasource_type": "table",
    "params": json.dumps(_pivot_params),
}
_pivot_body = json.dumps(pivot_chart_payload)
response = requests.post(
    f"{SUPERSET_URL}/api/v1/chart/",
    headers=api_headers,
    data=_pivot_body,
)
print(f"透视表创建结果: {response.status_code}")
8.3 气泡图 / 散点图(Bubble / Scatter Plot)
sql
-- ============================================================
-- Scatter plot: order amount versus profit margin
-- Each point is one customer; the bubble size is the order count.
-- ============================================================
SELECT
    -- customer_id exists in both joined tables, so it must be qualified;
    -- an unqualified reference is rejected as ambiguous.
    c.customer_id,
    c.customer_name,
    AVG(order_amount) AS avg_order_amount,                          -- average order amount (X axis)
    -- NULLIF avoids a division-by-zero error for customers without revenue.
    SUM(profit) / NULLIF(SUM(revenue), 0) * 100 AS profit_margin,   -- profit margin (Y axis)
    COUNT(order_id) AS order_count,                                 -- order count (bubble size)
    customer_region                                                 -- region (colour group)
FROM
    orders o
JOIN
    customers c ON o.customer_id = c.customer_id
GROUP BY
    c.customer_id, c.customer_name, customer_region
HAVING
    COUNT(order_id) >= 5 -- at least 5 orders to be statistically meaningful
ORDER BY
    avg_order_amount DESC;
python
# ============================================================
# Create the scatter (bubble) chart through the API
# ============================================================
def _simple_metric(column_name, aggregate, label):
    """Return a SIMPLE ad-hoc metric definition for one column."""
    return {
        "expressionType": "SIMPLE",
        "column": {"column_name": column_name},
        "aggregate": aggregate,
        "label": label,
    }


# Chart parameters; serialised to a JSON string for the "params" field.
_scatter_params = {
    "datasource": "1__table",
    # X axis: average order amount
    "x": _simple_metric("order_amount", "AVG", "平均订单金额"),
    # Y axis: average profit margin
    "y": _simple_metric("profit_margin", "AVG", "利润率(%)"),
    # Bubble size: number of orders
    "size": _simple_metric("order_id", "COUNT", "订单数"),
    # Series (colour grouping)
    "series": "customer_region",
    # Maximum bubble size
    "max_bubble_size": "50",
    # Axis number formats
    "x_axis_format": ",.0f",
    "y_axis_format": ",.1f",
    # Row limit
    "row_limit": 500,
}
scatter_chart_payload = {
    "slice_name": "客户金额-利润率分析",
    "viz_type": "bubble",  # chart type: bubble chart
    "datasource_id": 1,
    "datasource_type": "table",
    "params": json.dumps(_scatter_params),
}
_scatter_body = json.dumps(scatter_chart_payload)
response = requests.post(
    f"{SUPERSET_URL}/api/v1/chart/",
    headers=api_headers,
    data=_scatter_body,
)
print(f"散点图创建结果: {response.status_code}")
九、仪表板(Dashboard)管理
9.1 创建和配置仪表板
python
# ============================================================
# create_dashboard.py
# 通过 API 创建仪表板并添加图表
# ============================================================
# ============================================================
# 步骤 1: 创建空白仪表板
# ============================================================
# Step 1: create an empty dashboard and remember its ID for the later steps.
dashboard_payload = {
    "dashboard_title": "销售分析驾驶舱",   # dashboard title
    "slug": "sales-analytics",            # URL alias used in links
    "published": True,                    # published flag
}
response = requests.post(
    f"{SUPERSET_URL}/api/v1/dashboard/",
    headers=api_headers,
    data=json.dumps(dashboard_payload),
)
# Stop here on failure: every later step needs dashboard_id, and continuing
# without it would only surface as a confusing NameError further down.
if response.status_code != 201:
    raise RuntimeError(
        f"仪表板创建失败: {response.status_code} - {response.text}"
    )
dashboard_id = response.json()["id"]
print(f"仪表板创建成功!ID: {dashboard_id}")
# ============================================================
# 步骤 2: 配置仪表板布局(Position JSON)
# ============================================================
# Superset 仪表板使用 JSON 结构描述布局
# 核心概念:
# - ROOT_ID: 根节点
# - GRID_ID: 网格容器
# - ROW_ID: 行容器
# - CHART_ID: 图表组件
# - DASHBOARD_HEADER_ID: 顶部标题栏
def _chart_node(node_id, row_id, chart_id, width, height, slice_name):
    """Build one CHART layout node placed inside the row `row_id`.

    `width` is a share of the 12-column grid. `height` is a layout size,
    not a pixel count (NOTE(review): confirm the grid-unit size for your
    Superset version).
    """
    return {
        "type": "CHART",
        "id": node_id,
        "children": [],
        "parents": ["ROOT_ID", "GRID_ID", row_id],
        "meta": {
            "chartId": chart_id,      # ID of the chart shown in this slot
            "width": width,
            "height": height,
            "sliceName": slice_name,
        },
    }


def _row_node(row_id, children):
    """Build one ROW layout node; every row carries the same transparent background."""
    return {
        "type": "ROW",
        "id": row_id,
        "children": children,
        "parents": ["ROOT_ID", "GRID_ID"],
        "meta": {"background": "BACKGROUND_TRANSPARENT"},
    }


# Dashboard layout: ROOT -> GRID -> two rows. Row 1 holds four KPI cards
# (3/12 columns each); row 2 holds the trend chart (8/12) and the region
# chart (4/12). Key order is kept stable so the serialized JSON is predictable.
position_json = json.dumps({
    "DASHBOARD_VERSION_KEY": "v2",
    "ROOT_ID": {
        "type": "ROOT",
        "id": "ROOT_ID",
        "children": ["GRID_ID"],
    },
    "GRID_ID": {
        "type": "GRID",
        "id": "GRID_ID",
        "children": ["ROW-1", "ROW-2"],
        "parents": ["ROOT_ID"],
    },
    "DASHBOARD_HEADER_ID": {
        "type": "HEADER",
        "id": "DASHBOARD_HEADER_ID",
        "meta": {
            "text": "销售分析驾驶舱",
        },
    },
    # --- Row 1: KPI cards ---
    "ROW-1": _row_node("ROW-1", ["CHART-1", "CHART-2", "CHART-3", "CHART-4"]),
    "CHART-1": _chart_node("CHART-1", "ROW-1", 1, 3, 50, "总收入"),
    "CHART-2": _chart_node("CHART-2", "ROW-1", 2, 3, 50, "总订单数"),
    "CHART-3": _chart_node("CHART-3", "ROW-1", 3, 3, 50, "平均客单价"),
    "CHART-4": _chart_node("CHART-4", "ROW-1", 4, 3, 50, "活跃客户数"),
    # --- Row 2: main charts ---
    # ROW-2 gets the same meta block as ROW-1 so both rows are described consistently.
    "ROW-2": _row_node("ROW-2", ["CHART-5", "CHART-6"]),
    "CHART-5": _chart_node("CHART-5", "ROW-2", 5, 8, 80, "月度销售趋势"),
    "CHART-6": _chart_node("CHART-6", "ROW-2", 6, 4, 80, "区域销售分布"),
})
# ============================================================
# 步骤 3: 更新仪表板(包含布局)
# ============================================================
# Step 3: attach the layout and dashboard-level metadata (native filters,
# colour scheme, auto-refresh) to the dashboard created in step 1.
update_payload = {
    "position_json": position_json,
    "json_metadata": json.dumps({
        # Native filter definitions. Each entry is tagged with
        # "type": "NATIVE_FILTER"; the kind of control goes in "filterType".
        "native_filter_configuration": [
            {
                "id": "filter-region",            # unique filter id
                "name": "客户区域",                # display name
                "type": "NATIVE_FILTER",
                "filterType": "filter_select",    # drop-down select
                "targets": [
                    {
                        "datasetId": 1,           # dataset the filter reads from
                        "column": {
                            "name": "customer_region",
                        },
                    }
                ],
                # Default selection: extraFormData is what gets applied to
                # the charts, filterState is what the control displays.
                "defaultDataMask": {
                    "extraFormData": {
                        "filters": [
                            {
                                "col": "customer_region",
                                "op": "IN",
                                "val": ["华东", "华南"],
                            }
                        ]
                    },
                    "filterState": {
                        "value": ["华东", "华南"],
                    },
                },
                # Charts affected by the filter: everything under ROOT_ID.
                "scope": {
                    "rootPath": ["ROOT_ID"],
                    "excluded": [],
                },
            },
            {
                "id": "filter-date",
                "name": "日期范围",
                "type": "NATIVE_FILTER",
                "filterType": "filter_time",      # time-range filter
                "targets": [{}],
                "defaultDataMask": {
                    "extraFormData": {
                        "time_range": "Last 30 days",
                    },
                    "filterState": {
                        "value": "Last 30 days",
                    },
                },
                # Same scope as the region filter, stated explicitly for consistency.
                "scope": {
                    "rootPath": ["ROOT_ID"],
                    "excluded": [],
                },
            },
        ],
        # Dashboard-wide colour scheme
        "color_scheme": "supersetColors",
        # Auto-refresh interval in seconds (0 disables auto refresh)
        "refresh_frequency": 300,
        # Legacy filter-box defaults, kept empty (this is not a default-tab setting)
        "default_filters": "{}",
    }),
}
response = requests.put(
    f"{SUPERSET_URL}/api/v1/dashboard/{dashboard_id}",
    headers=api_headers,
    data=json.dumps(update_payload),
)
print(f"仪表板更新结果: {response.status_code}")
9.2 发布与权限管理
python
# ============================================================
# dashboard_publish.py
# 仪表板的发布、收藏和访问权限管理
# ============================================================
# ============================================================
# 1. 发布仪表板(使其对其他用户可见)
# ============================================================
publish_payload = {
"published": True, # 发布状态
# slug: 用于生成分享链接
# 访问地址: http://superset:8088/superset/dashboard/sales-analytics/
"slug": "sales-analytics",
}
response = requests.put(
f"{SUPERSET_URL}/api/v1/dashboard/{dashboard_id}",
headers=api_headers,
data=json.dumps(publish_payload),
)
print(f"发布状态: {'成功' if response.status_code == 200 else '失败'}")
# ============================================================
# 2. 将仪表板标记为收藏
# ============================================================
# The favorites endpoint is not a toggle: POST marks the dashboard as a
# favorite of the current user, DELETE on the same URL removes the mark.
response = requests.post(
    f"{SUPERSET_URL}/api/v1/dashboard/{dashboard_id}/favorites/",
    headers=api_headers,
)
print(f"收藏操作结果: {response.status_code}")
# ============================================================
# 3. 配置行级安全策略(RLS)
# ============================================================
# 行级安全策略:不同用户只能看到自己权限范围内的数据
# 例如:华东区域的经理只能看到华东区域的数据
# Row-level security rule: members of the listed role only see rows that
# match the clause.
rls_payload = {
    "name": "华东区域经理只能看华东数据",      # rule name
    "clause": "customer_region = '华东'",      # predicate appended to queries
    "tables": [1],                             # dataset IDs the rule applies to
    # "Regular" applies the clause TO the listed roles. "Base" would do the
    # opposite (apply it to everyone EXCEPT the listed roles), which
    # contradicts what this rule is meant to do.
    "filter_type": "Regular",
    "group_key": "",                           # optional grouping key
    # The API expects role IDs, not role objects. 3 = "华东区域经理".
    "roles": [3],
}
response = requests.post(
    f"{SUPERSET_URL}/api/v1/rowlevelsecurity/",
    headers=api_headers,
    data=json.dumps(rls_payload),
)
print(f"RLS 策略创建结果: {response.status_code}")
十、数据上传功能
10.1 启用 CSV/Excel 上传
python
# ============================================================
# superset_config.py 中启用数据上传功能
# ============================================================
# ============================================================
# 步骤 1: 在 superset_config.py 中添加以下配置
# ============================================================
# 允许上传 CSV 文件
UPLOAD_FOLDER = "/app/superset_home/uploads/" # 上传文件的临时存储目录
# CSV 上传相关配置
CSV_EXTENSIONS = {"csv", "tsv", "txt"} # 允许的文件扩展名
EXCEL_EXTENSIONS = {"xlsx", "xls"} # 允许的 Excel 扩展名
ALLOWED_EXTENSIONS = CSV_EXTENSIONS | EXCEL_EXTENSIONS
# 上传文件最大大小(单位:MB,不是字节)
# 100 表示 100MB(约等于 100 * 1024 * 1024 字节)
UPLOAD_MAX_SIZE_MB = 100
# 数据库级别允许上传
# 在数据库连接的 "Extra" JSON 中配置:
# {
# "allows_virtual_table_explore": true,
# "allow_csv_upload": true # 关键配置:允许 CSV 上传
# }
10.2 通过 API 上传 CSV 数据
python
# ============================================================
# upload_csv.py
# 通过 API 上传 CSV 文件到 Superset 并创建数据集
# ============================================================
import requests
import os
SUPERSET_URL = "http://localhost:8088"
# ... headers 设置同上 ...
# ============================================================
# 步骤 1: 上传 CSV 文件
# ============================================================
# 要上传的 CSV 文件路径
csv_file_path = "/path/to/sales_data.csv"
# CSV 文件内容示例(sales_data.csv):
# order_id,customer_name,product_category,order_amount,order_date,region
# 1001,张三,电子产品,2500.00,2024-01-15,华东
# 1002,李四,服装鞋帽,380.50,2024-01-16,华南
# 1003,王五,食品饮料,125.80,2024-01-16,华北
# ...
# 使用 multipart/form-data 格式上传文件
# 注意:上传文件时不要手动设置 Content-Type
# Headers for a multipart upload. Content-Type is deliberately left out so
# that requests can generate the multipart boundary itself.
upload_headers = {
    "Authorization": f"Bearer {access_token}",
    "Accept": "application/json",
}
# Extra form fields sent alongside the file.
data = {
    "tableName": "sales_data_from_csv",   # table to create
    "databaseId": 1,                      # target database ID
    "schema": "public",                   # target schema
    "delimiter": ",",                     # CSV delimiter
    "already_exists": "append",           # "replace" | "append" | "fail"
    "headerRow": 0,                       # 0 = first row is the header
    "decimalCharacter": ".",              # decimal separator
}
# Open the file in a `with` block so the handle is closed even when the
# request raises; the original left the file object open.
# NOTE(review): /api/v1/dataset/import/ is the endpoint for importing
# exported dataset bundles — confirm that your Superset version accepts a
# raw CSV here (newer releases expose a per-database upload endpoint).
with open(csv_file_path, "rb") as csv_file:
    files = {
        "file": (
            os.path.basename(csv_file_path),   # file name
            csv_file,                          # binary file content
            "text/csv",                        # MIME type
        ),
    }
    response = requests.post(
        f"{SUPERSET_URL}/api/v1/dataset/import/",
        headers=upload_headers,
        files=files,
        data=data,
    )
if response.status_code == 200:
    result = response.json()
    dataset_id = result.get("id")
    print(f"CSV 上传成功!自动创建的数据集 ID: {dataset_id}")
else:
    print(f"上传失败: {response.status_code} - {response.text}")
十一、高级分析功能
11.1 滚动平均值(Rolling Average)
sql
-- ============================================================
-- 滚动平均值:消除数据波动,观察长期趋势
-- 在 Superset 图表中配置:
-- Analytics → Rolling Function → mean
-- Rolling Periods → 7 (7天滚动平均)
-- ============================================================
-- 方法 1: 在 SQL 中直接计算 7 天滚动平均
SELECT
order_date,
daily_revenue,
-- 窗口函数计算 7 天滚动平均
-- ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
-- 表示取当前行及前 6 行(共 7 行)的平均值
ROUND(
AVG(daily_revenue) OVER (
ORDER BY order_date -- 按日期排序
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW -- 窗口范围:前6行+当前行
),
2
) AS rolling_avg_7d,
-- 30 天滚动平均
ROUND(
AVG(daily_revenue) OVER (
ORDER BY order_date
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW -- 前29行+当前行 = 30天
),
2
) AS rolling_avg_30d,
-- 滚动总和(7 天累计)
SUM(daily_revenue) OVER (
ORDER BY order_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS rolling_sum_7d
FROM (
-- 子查询:先按天聚合每日收入
SELECT
DATE(order_date) AS order_date,
SUM(order_amount) AS daily_revenue
FROM
orders
WHERE
order_date >= '2024-01-01'
GROUP BY
DATE(order_date)
) daily_stats
ORDER BY
order_date;
python
# ============================================================
# 通过 API 配置图表的滚动平均分析
# ============================================================
# Time-series chart of daily revenue smoothed with a 7-period rolling mean.
rolling_avg_chart = {
    "slice_name": "日销售额趋势(含7日/30日滚动平均)",
    "viz_type": "echarts_timeseries",
    "datasource_id": 1,
    "datasource_type": "table",
    "params": json.dumps({
        "datasource": "1__table",
        "x_axis": "order_date",
        "time_grain_sqla": "P1D",          # daily grain
        "metrics": [
            {
                "expressionType": "SIMPLE",
                "column": {"column_name": "order_amount"},
                "aggregate": "SUM",
                "label": "每日收入",
            },
        ],
        "groupby": [],
        "row_limit": 10000,
        # --- Advanced Analytics: rolling window ---
        # These options are top-level form-data keys. Nesting them under an
        # "advanced_analytics" object (as before) means they are never read.
        "rolling_type": "mean",            # rolling function
        "rolling_periods": 7,              # window size in periods
        "min_periods": 1,                  # periods required before a value is emitted
        # NOTE(review): this is a plain SUM metric, not a 30-day rolling
        # average, despite its label — confirm how the second line should
        # be produced (e.g. a separate chart or a SQL window function).
        "secondary_metric": {
            "expressionType": "SIMPLE",
            "column": {"column_name": "order_amount"},
            "aggregate": "SUM",
            "label": "30日滚动平均",
        },
        # Styling
        "show_legend": True,
        "rich_tooltip": True,
        "y_axis_format": ",.0f",
        "x_axis_time_format": "%m-%d",
        "time_range": "Last 90 days",
    }),
}
11.2 时间比较(Time Comparison)
sql
-- ============================================================
-- 时间比较:同比(YoY)和环比(MoM)分析
-- ============================================================
-- 案例 1: 月度同比分析(本月 vs 去年同月)
-- Month-over-year comparison: this month's revenue vs. the same month one year earlier.
WITH monthly AS (
    -- One row per calendar month
    SELECT
        DATE_TRUNC('month', order_date) AS month,
        SUM(order_amount) AS current_revenue
    FROM
        orders
    WHERE
        order_date >= '2022-01-01'
    GROUP BY
        DATE_TRUNC('month', order_date)
),
with_last_year AS (
    SELECT
        month,
        current_revenue,
        -- LAG(..., 12) looks 12 ROWS back, so it only equals "same month
        -- last year" when no month is missing from the data.
        LAG(current_revenue, 12) OVER (ORDER BY month) AS last_year_revenue
    FROM
        monthly
)
SELECT
    month,
    current_revenue,
    last_year_revenue,
    -- YoY growth % = (current - last year) / last year * 100.
    -- NULLIF avoids a division-by-zero error when last year's revenue is 0,
    -- matching the guard used in the week-over-week query.
    ROUND(
        (current_revenue - last_year_revenue) * 100.0
            / NULLIF(last_year_revenue, 0),
        2
    ) AS yoy_growth_pct
FROM
    with_last_year
ORDER BY
    month;
-- 案例 2: 周度环比分析(本周 vs 上周)
SELECT
DATE_TRUNC('week', order_date) AS week,
SUM(order_amount) AS current_week_revenue,
-- 上周收入(偏移 1 行)
LAG(SUM(order_amount), 1) OVER (
ORDER BY DATE_TRUNC('week', order_date)
) AS last_week_revenue,
-- 环比增长率
ROUND(
(SUM(order_amount) - LAG(SUM(order_amount), 1) OVER (
ORDER BY DATE_TRUNC('week', order_date)
)) * 100.0 / NULLIF(LAG(SUM(order_amount), 1) OVER (
ORDER BY DATE_TRUNC('week', order_date)
), 0), -- 使用 NULLIF 防止除零错误
2
) AS wow_growth_pct
FROM
orders
WHERE
order_date >= '2024-01-01'
GROUP BY
DATE_TRUNC('week', order_date)
ORDER BY
week;
python
# ============================================================
# Superset 内置时间比较配置
# ============================================================
# Superset Advanced Analytics 提供内置的时间偏移功能
# 无需手写 SQL,直接在图表配置中设置
time_comparison_config = {
"slice_name": "月度收入同比分析",
"viz_type": "echarts_timeseries",
"datasource_id": 1,
"datasource_type": "table",
"params": json.dumps({
"datasource": "1__table",
"x_axis": "order_date",
"time_grain_sqla": "P1M", # 月度粒度
"metrics": [
{
"expressionType": "SIMPLE",
"column": {"column_name": "order_amount"},
"aggregate": "SUM",
"label": "月度收入",
},
],
# --- 时间偏移配置(Time Comparison)---
# Superset 会自动计算并叠加显示偏移后的数据线
"time_compare": [
"1 year ago", # 同比:1 年前的同期数据
"1 month ago", # 环比:1 个月前的同期数据
],
# 比较类型
# "values": 显示原始值(两条线)
# "absolute": 显示差异绝对值
# "percentage": 显示百分比变化
# "ratio": 显示比值
"comparison_type": "values",
"show_legend": True,
"rich_tooltip": True,
"y_axis_format": ",.0f",
}),
}
11.3 数据重采样(Resampling)
sql
-- ============================================================
-- 数据重采样:改变时间序列的采样频率
-- 下采样(降频):如 日 → 月
-- 上采样(升频):如 月 → 日(需要插值填充)
-- ============================================================
-- 案例 1: 下采样 --- 日数据聚合为月数据
-- 原始数据粒度:每天一条记录
-- 目标粒度:每月一条记录
SELECT
DATE_TRUNC('month', order_date) AS month, -- 截断到月份
-- 下采样时的聚合方式
SUM(order_amount) AS monthly_total, -- 月度总收入
AVG(order_amount) AS monthly_avg, -- 月度平均订单金额
COUNT(*) AS monthly_orders, -- 月度订单数
MIN(order_date) AS month_start, -- 月首日期
MAX(order_date) AS month_end, -- 月末日期
-- 标准差:衡量数据波动性
STDDEV(order_amount) AS volatility
FROM
orders
WHERE
order_date >= '2023-01-01'
GROUP BY
DATE_TRUNC('month', order_date)
ORDER BY
month;
-- 案例 2: 上采样 --- 周数据展开为日数据(使用 generate_series)
-- 生成连续日期序列,再用 LEFT JOIN 填充缺失日期的数据
WITH weekly_data AS (
    -- Source data at weekly grain
    SELECT
        DATE_TRUNC('week', order_date) AS week_start,
        SUM(order_amount) AS weekly_revenue,
        COUNT(*) AS weekly_orders
    FROM
        orders
    GROUP BY
        DATE_TRUNC('week', order_date)
),
date_series AS (
    -- Continuous calendar of days
    SELECT
        generate_series(
            '2024-01-01'::date,     -- start date
            '2024-12-31'::date,     -- end date
            '1 day'::interval       -- step: one day
        )::date AS date
),
joined AS (
    SELECT
        ds.date,
        wd.weekly_revenue,          -- only populated on the week's first day
        wd.weekly_orders,
        -- COUNT ignores NULLs, so this running count increases only on
        -- days that carry a value; all following empty days share the
        -- same group number as the last populated day.
        COUNT(wd.weekly_revenue) OVER (ORDER BY ds.date) AS fill_group
    FROM
        date_series ds
    LEFT JOIN
        weekly_data wd ON ds.date = wd.week_start
)
SELECT
    date,
    weekly_revenue,
    weekly_orders,
    -- Forward fill: each group holds exactly one non-NULL value, so MAX
    -- returns it for every day in the group. PostgreSQL does not accept
    -- LAG(... IGNORE NULLS), hence this grouping approach.
    MAX(weekly_revenue) OVER (PARTITION BY fill_group) AS filled_revenue
FROM
    joined
ORDER BY
    date;
python
# ============================================================
# Superset 内置数据重采样配置
# ============================================================
resample_config = {
"slice_name": "月度销售趋势(重采样)",
"viz_type": "echarts_timeseries",
"datasource_id": 1,
"datasource_type": "table",
"params": json.dumps({
"datasource": "1__table",
"x_axis": "order_date",
# 时间粒度:从日粒度调整为月粒度
"time_grain_sqla": "P1M",
"metrics": [
{
"expressionType": "SIMPLE",
"column": {"column_name": "order_amount"},
"aggregate": "SUM",
"label": "月度收入",
},
],
# --- 重采样配置(Resample)---
"resample_method": "asfreq", # 重采样方法
# 可选值:
# "asfreq": 原始频率(不做聚合)
# "mean": 均值
# "sum": 求和
# "ffill": 前向填充(Forward Fill)
# "bfill": 后向填充(Backward Fill)
# "zero": 用零填充缺失值
# "linear": 线性插值
# 重采样的目标频率
"resample_rule": "1M", # 1M = 1 个月
# 可选值:
# "1D" 日
# "1W" 周
# "1M" 月
# "1Q" 季度
# "1Y" 年
# "1H" 小时
# "1T" 分钟
# 填充方法(用于上采样时的缺失值填充)
"resample_fillmethod": "ffill", # 前向填充
# 可选值:
# "ffill": 用前一个有效值填充
# "bfill": 用后一个有效值填充
# None/null: 不填充(保留 NaN)
"show_legend": True,
"y_axis_format": ",.0f",
}),
}
十二、缓存与性能优化
12.1 多层缓存配置
python
# ============================================================
# advanced_caching.py
# Superset 的多层缓存策略配置
# ============================================================
import json
from functools import partial
# ============================================================
# 1. 数据源缓存(Data Cache)--- 缓存查询结果
# ============================================================
# 当多个用户访问同一个图表时,直接返回缓存的查询结果
# 避免重复执行 SQL 查询数据库
# Cache settings for chart query results: Redis on localhost:6379, DB 2,
# keys prefixed with "superset_data_", entries kept for 600 seconds.
DATA_CACHE_CONFIG = dict(
    CACHE_TYPE="RedisCache",
    CACHE_DEFAULT_TIMEOUT=600,
    CACHE_KEY_PREFIX="superset_data_",
    CACHE_REDIS_HOST="localhost",
    CACHE_REDIS_PORT=6379,
    CACHE_REDIS_DB=2,
)
# ============================================================
# 2. 元数据缓存(Metadata Cache)--- 缓存表名、列名等
# ============================================================
# 避免每次打开数据集选择器时都查询数据库的 metadata
# Cache settings for metadata: Redis on localhost:6379, DB 1, keys
# prefixed with "superset_meta_", entries kept for 300 seconds.
CACHE_CONFIG = dict(
    CACHE_TYPE="RedisCache",
    CACHE_DEFAULT_TIMEOUT=300,
    CACHE_KEY_PREFIX="superset_meta_",
    CACHE_REDIS_HOST="localhost",
    CACHE_REDIS_PORT=6379,
    CACHE_REDIS_DB=1,
)
# ============================================================
# 3. 缓存预热(Cache Warming)
# ============================================================
# 通过 Celery Beat 定时任务预加载常用仪表板的缓存
# 确保用户打开仪表板时能快速响应
# 在 CeleryConfig 中添加缓存预热定时任务
# Periodic cache warm-up jobs for Celery beat.
# Superset registers a single warm-up task named "cache-warmup"; what gets
# warmed is chosen with `strategy_name` plus that strategy's own arguments.
# NOTE(review): Celery reads the schedule from the `beat_schedule` attribute
# of the Celery config class — confirm this dict is assigned there.
CELERY_BEAT_SCHEDULE = {
    # Every 15 minutes: warm the most-viewed dashboards.
    "cache-warmup-core-dashboards": {
        "task": "cache-warmup",
        "schedule": 900,                 # 15 minutes = 900 seconds
        "kwargs": {
            "strategy_name": "top_n_dashboards",
            "top_n": 5,                  # how many dashboards to warm
            "since": "7 days ago",       # popularity window
        },
    },
    # Every hour: warm dashboards carrying the given tags.
    "cache-warmup-report-charts": {
        "task": "cache-warmup",
        "schedule": 3600,                # 1 hour
        "kwargs": {
            "strategy_name": "dashboard_tags",
            "tags": ["report"],
        },
    },
}
# ============================================================
# 4. 查询结果缓存的自定义缓存键
# ============================================================
# 默认情况下,缓存键由 SQL 查询语句的哈希值决定
# 如果需要更精细的控制(如基于用户身份生成不同缓存),可自定义
def custom_cache_key(*args, **kwargs):
    """Build a cache key that also depends on the current user's role.

    The role is read from ``flask.g.user_role`` and falls back to
    ``"anonymous"`` when the attribute is absent.

    Args:
        *args: positional values that identify the cached query.
        **kwargs: keyword values that identify the cached query.

    Returns:
        str: ``"superset_<role>_<sha256 hex digest of the arguments>"``.
    """
    import hashlib

    from flask import g

    user_role = getattr(g, "user_role", "anonymous")
    # Python's built-in hash() of a str is salted per process, so every
    # worker would compute a different key for the same query and a shared
    # cache would never hit. A real digest is stable across processes.
    fingerprint = hashlib.sha256(
        (str(args) + str(kwargs)).encode("utf-8")
    ).hexdigest()
    return f"superset_{user_role}_{fingerprint}"
12.2 Celery 异步查询配置
python
# ============================================================
# celery_async_query.py
# Celery 异步查询配置:允许长时间运行的 SQL 不阻塞 Web 服务器
# ============================================================
# ============================================================
# 1. superset_config.py 中的 Celery 配置
# ============================================================
# 使用 CeleryConfig 类(见前文),确保以下配置正确
class CeleryConfig:
    """Celery settings used by Superset workers for asynchronous queries."""

    # Redis is both the message broker and the result store.
    broker_url = "redis://localhost:6379/0"
    result_backend = "redis://localhost:6379/0"
    # Modules the worker imports at start-up. Replacing Superset's default
    # Celery config drops its own `imports`, so the SQL Lab task module has
    # to be listed again or async queries are never picked up.
    imports = ("superset.sql_lab", "superset.tasks.scheduler")
    # Serialization
    task_serializer = "json"
    result_serializer = "json"
    accept_content = ["json"]
    # Time zone
    timezone = "Asia/Shanghai"
    enable_utc = True
    # Number of concurrent worker processes (tune to the CPU count)
    worker_concurrency = 8
    # Tasks prefetched per worker process
    worker_prefetch_multiplier = 1
    # Soft limit (seconds): the task is signalled so it can clean up
    task_soft_time_limit = 600      # 10 minutes
    # Hard limit (seconds): the task is killed
    task_time_limit = 1200          # 20 minutes


CELERY_CONFIG = CeleryConfig
# ============================================================
# 2. 数据库连接配置中的异步开关
# ============================================================
# 在注册数据库时,通过 allow_run_async 开启异步查询
# Database registration payload with asynchronous execution switched on.
# CTAS/CVAS are allowed, DML is not. The "extra" field is a JSON string.
async_db_config = dict(
    database_name="big_query_warehouse",
    sqlalchemy_uri="presto://user@presto-cluster:8080/hive",
    allow_run_async=True,
    allow_ctas=True,
    allow_cvas=True,
    allow_dml=False,
    extra=json.dumps(
        dict(
            timeout=600000,     # 600000 ms = 600 s = 10 minutes
            run_async=True,
        )
    ),
)
十三、警报与报表(Alerts & Reports)
13.1 配置定时报表
python
# ============================================================
# alerts_reports.py
# Superset 的警报和报表推送功能配置
# ============================================================
# ============================================================
# 1. 基础配置(在 superset_config.py 中)
# ============================================================
# 启用警报和报表功能
FEATURE_FLAGS = {
"ALERT_REPORTS": True, # 启用功能开关
}
# SMTP 邮件配置(见前文配置章节)
SMTP_HOST = "smtp.example.com"
SMTP_PORT = 587
SMTP_STARTTLS = True
SMTP_MAIL_FROM = "superset@company.com"
# ============================================================
# 2. 通过 API 创建警报规则
# ============================================================
# Alert: runs the SQL every two hours and notifies by e-mail when the
# returned count is greater than zero.
alert_payload = {
    "name": "库存预警 - 低于安全库存",
    "description": "当产品库存低于安全阈值时触发邮件通知",
    "type": "Alert",
    "active": True,
    # --- Data source ---
    "database": 1,                       # database ID
    "sql": (
        "SELECT COUNT(*) AS low_stock_count "
        "FROM products "
        "WHERE current_stock < safety_stock "
        "AND category IN ('电子产品', '食品饮料')"
    ),
    # --- Trigger condition ---
    "validator_type": "operator",
    # Sent as a JSON object (not a pre-serialized string); the comparison
    # operator key is "op".
    "validator_config_json": {
        "op": ">",
        "threshold": 0,                  # fire when the result is > 0
    },
    # --- Notification ---
    "recipients": [
        {
            # Recipient types are capitalized ("Email", "Slack"), matching
            # the Slack entry used in the report example.
            "type": "Email",
            "recipient_config_json": {
                "target": "warehouse-manager@company.com; inventory@company.com",
            },
        },
    ],
    # --- Schedule (cron: minute hour day month weekday) ---
    #   "*/15 * * * *"  every 15 minutes
    #   "0 9 * * 1-5"   weekdays at 09:00
    #   "0 0 * * 1"     Mondays at 00:00
    #   "0 8 1 * *"     first day of the month at 08:00
    "crontab": "0 */2 * * *",            # every 2 hours
    # NOTE(review): confirm that the report API of your Superset version
    # accepts "message_template"; unknown fields may be rejected.
    "message_template": (
        "⚠️ 库存预警通知\n\n"
        "当前有 {{ value }} 个产品库存低于安全阈值。\n"
        "请尽快检查并补货。\n\n"
        "数据查询时间: {{ now }}\n"
        "查看详情: {{ url }}"
    ),
}
response = requests.post(
    f"{SUPERSET_URL}/api/v1/report/",
    headers=api_headers,
    data=json.dumps(alert_payload),
)
print(f"警报创建结果: {response.status_code}")
# ============================================================
# 3. 创建定时仪表板截图报表
# ============================================================
# Scheduled report: sends a screenshot of dashboard 1 on weekdays at 09:00
# to an e-mail list and a Slack channel.
report_payload = {
    "name": "每日销售日报",
    "description": "每天上午 9 点发送销售日报截图",
    "type": "Report",
    "active": True,
    # --- Report content ---
    "dashboard": 1,                      # dashboard ID
    # NOTE(review): confirm that "extra_params" is accepted by the report
    # API of your Superset version; unknown fields may be rejected.
    "extra_params": json.dumps({
        "dashboard_id": 1,
        # Filter overriding the dashboard default: yesterday only
        "filters": [
            {
                "col": "order_date",
                "op": "TEMPORAL_RANGE",
                "val": "Yesterday",
            }
        ],
    }),
    # --- Notification ---
    # Recipient configs are sent as JSON objects, and recipient types are
    # capitalized consistently ("Email" / "Slack").
    "recipients": [
        {
            "type": "Email",
            "recipient_config_json": {
                "target": "sales-team@company.com",
            },
        },
        {
            "type": "Slack",
            "recipient_config_json": {
                "target": "#sales-reports",   # Slack channel
            },
        },
    ],
    # --- Schedule: weekdays at 09:00 ---
    "crontab": "0 9 * * 1-5",
    "message_template": "📊 销售日报已生成,请查收。",
}
response = requests.post(
    f"{SUPERSET_URL}/api/v1/report/",
    headers=api_headers,
    data=json.dumps(report_payload),
)
print(f"报表创建结果: {response.status_code}")
十四、主题自定义
14.1 自定义 Superset 主题
python
# ============================================================
# superset_theme.py
# 自定义 Superset 的视觉主题(颜色、字体等)
# 在 superset_config.py 中指定自定义主题
# ============================================================
# Superset 主题是一个嵌套的 JavaScript 对象(JSON 格式)
# 定义了整个应用的色彩系统、间距、字体等
THEME_OVERRIDES = {
# --- 色彩系统 ---
"colors": {
# 主色调(品牌色)
"primary": {
"base": "#1890ff", # 主色:蓝色
"dark1": "#096dd9", # 深色变体
"dark2": "#0050b3", # 更深的变体
"light1": "#69c0ff", # 浅色变体
"light2": "#91d5ff", # 更浅的变体
"light3": "#bae7ff", # 最浅的变体
},
# 次要色(强调色)
"secondary": {
"base": "#722ed1", # 紫色
"dark1": "#531dab",
"dark2": "#391085",
"light1": "#b37feb",
"light2": "#d3adf7",
},
# 灰度色系
"grayscale": {
"base": "#666666", # 基准灰
"dark1": "#444444",
"dark2": "#333333",
"dark3": "#222222",
"light1": "#888888",
"light2": "#aaaaaa",
"light3": "#cccccc",
"light4": "#e8e8e8",
"light5": "#f5f5f5",
},
# 语义色
"error": {
"base": "#f5222d", # 错误:红色
},
"warning": {
"base": "#fa8c16", # 警告:橙色
},
"success": {
"base": "#52c41a", # 成功:绿色
},
"info": {
"base": "#1890ff", # 信息:蓝色
},
# --- 图表调色板(最多 20 种颜色,用于多系列数据)---
"sequential": [
"#1890ff", # 蓝色
"#2fc25b", # 绿色
"#facc14", # 黄色
"#f04864", # 红色
"#8543e0", # 紫色
"#13c2c2", # 青色
"#fa8c16", # 橙色
"#a0d911", # 青绿色
"#1d39c4", # 深蓝色
"#eb2f96", # 玫红色
],
# 分类调色板(用于分类数据)
"categorical": [
"#1fa8c9", # 青色
"#454e7c", # 深灰蓝
"#5ac189", # 绿色
"#ff7f44", # 橙色
"#666666", # 灰色
"#e377c2", # 粉色
"#bcbd22", # 黄绿
"#17becf", # 浅青
"#aec7e8", # 浅蓝
"#ffbb78", # 浅橙
],
},
# --- 字体配置 ---
"typography": {
# 字体族
"fontFamilies": {
"sansSerif": "'Inter', 'Noto Sans SC', 'PingFang SC', sans-serif",
"serif": "'Noto Serif SC', 'Songti SC', serif",
"monospace": "'JetBrains Mono', 'Fira Code', 'Courier New', monospace",
},
# 字号
"fontSizes": {
"tiny": "10px",
"small": "12px",
"base": "14px", # 正文字号
"large": "16px",
"xlarge": "20px",
"xxlarge": "24px",
},
},
# --- 间距系统 ---
"gridUnit": 4, # 基准间距单位(4px)
# --- 边框圆角 ---
"borderRadius": {
"small": "2px",
"medium": "4px",
"large": "8px",
},
# --- 阴影 ---
"shadows": {
"shadowLow": "0 1px 2px rgba(0, 0, 0, 0.1)",
"shadowMedium": "0 4px 8px rgba(0, 0, 0, 0.12)",
"shadowHigh": "0 8px 16px rgba(0, 0, 0, 0.15)",
},
# --- 图表特定配置 ---
"chartProperties": {
# 图表默认背景色
"backgroundColor": "#ffffff",
# 标题字体大小
"headerFontSize": "14px",
# 正文字体大小
"bodyFontSize": "12px",
},
}
十五、地图可视化(Country Map & Geo)
15.1 中国省份地图配置
sql
-- ============================================================
-- 中国省份销售地图:使用 Superset 的 Country Map 可视化
-- ============================================================
-- 查询各省份的销售数据
-- Sales per province since 2024-01-01, feeding the country-map chart.
SELECT
    province,                                            -- must match the map's region names
    SUM(order_amount) AS total_revenue,                  -- revenue per province
    -- customer_id exists in both joined tables and must be qualified,
    -- otherwise the reference is ambiguous.
    COUNT(DISTINCT o.customer_id) AS customer_count,     -- distinct customers
    COUNT(order_id) AS order_count,                      -- number of orders
    -- Average order value
    ROUND(SUM(order_amount) / COUNT(order_id), 2) AS avg_order_value
FROM
    orders o
JOIN
    customers c ON o.customer_id = c.customer_id
WHERE
    order_date >= '2024-01-01'
GROUP BY
    province
ORDER BY
    total_revenue DESC;
python
# ============================================================
# 配置中国地图切片
# ============================================================
china_map_chart = {
"slice_name": "全国销售热力地图",
"viz_type": "country_map", # 图表类型:国家地图
"datasource_id": 1,
"datasource_type": "table",
"params": json.dumps({
"datasource": "1__table",
# 选择国家代码
"select_country": "china", # 中国地图
# 其他可选国家: "united_states", "japan", "france", 等
# 地理实体列:省份名称
"entity": "province",
# 度量指标:用于地图颜色深浅
"metric": {
"expressionType": "SIMPLE",
"column": {"column_name": "order_amount"},
"aggregate": "SUM",
"label": "总销售额",
},
# 颜色方案
"linear_color_scheme": "superset_seq_1", # Superset 默认序列色
# 颜色范围
"number_format": ",.0f", # 数字格式:千分位整数
# 地图下方图例的刻度数
"steps": 10, # 将颜色分为 10 个等级
# 底图颜色
"country_color_type": "select_country",
# 时间范围
"time_range": "2024-01-01 : 2024-12-31",
# 行级过滤
"adhoc_filters": [
{
"expressionType": "SIMPLE",
"subject": "province",
"operator": "NOT IN",
"comparator": ["台湾"], # 排除某些区域
"clause": "WHERE",
}
],
}),
}
15.2 地理坐标散点地图(Deck.GL)
python
# ============================================================
# 使用 Deck.GL 创建地理坐标散点地图
# 适用于经纬度数据的可视化(如门店分布、配送路径等)
# ============================================================
geo_scatter_chart = {
"slice_name": "全国门店分布地图",
"viz_type": "deck_scatter", # Deck.GL 散点地图
"datasource_id": 2, # 包含经纬度的数据集
"datasource_type": "table",
"params": json.dumps({
"datasource": "2__table",
# 经度列
"longitude": "lng", # 经度字段名
# 纬度列
"latitude": "lat", # 纬度字段名
# 散点半径(根据度量值自动调整大小)
"point_radius_fixed": {
"type": "metric", # 使用指标值动态计算半径
"value": {
"expressionType": "SIMPLE",
"column": {"column_name": "monthly_revenue"},
"aggregate": "SUM",
},
},
# 散点颜色
"color_picker": {
"r": 24, # 红色分量 (0-255)
"g": 144, # 绿色分量
"b": 255, # 蓝色分量
"a": 0.8, # 透明度 (0-1)
},
# 地图初始视图
"viewport": {
"latitude": 35.86, # 中心纬度
"longitude": 104.19, # 中心经度(中国中心)
"zoom": 4, # 缩放级别
"pitch": 0, # 倾斜角度
"bearing": 0, # 旋转角度
},
# 底图样式
"map_style": "mapbox://styles/mapbox/dark-v10", # 暗色底图
# 其他底图选项:
# "mapbox://styles/mapbox/light-v10" 浅色底图
# "mapbox://styles/mapbox/streets-v11" 街道底图
# "mapbox://styles/mapbox/satellite-v9" 卫星底图
# 点大小范围
"min_radius": 2, # 最小半径
"max_radius": 50, # 最大半径
# 行数限制
"row_limit": 10000,
}),
}
十六、导入导出
16.1 导出和导入仪表板资产
bash
# ============================================================
# 通过 CLI 导出/导入 Superset 资产(仪表板、图表、数据集等)
# ============================================================
# --- Export dashboards ---
# The export contains the dashboards together with their charts, datasets
# and database connections. The CLI exports ALL dashboards and has no
# per-dashboard selector; to export a single dashboard use the REST
# endpoint /api/v1/dashboard/export/ instead.
superset export-dashboards \
  --dashboard-file /path/to/export/dashboards_export.yaml

# --- Import dashboards ---
# Missing datasets and database connections are created on import, but
# database credentials have to be configured by hand afterwards.
# --username is the user the imported assets are attributed to.
superset import-dashboards \
  --path /path/to/export/dashboards_export.yaml \
  --username admin

# --- Export datasets ---
# The CLI command is named after "datasources"; like the dashboard export
# it writes every dataset and has no per-ID selector.
superset export-datasources \
  --datasource-file /path/to/export/datasets.yaml

# --- Import datasets ---
superset import-datasources \
  --path /path/to/export/datasets.yaml
python
# ============================================================
# 通过 API 导出仪表板
# ============================================================
# 导出仪表板(返回 ZIP 文件)
response = requests.get(
f"{SUPERSET_URL}/api/v1/dashboard/export/?q=!({dashboard_id})",
headers=api_headers,
stream=True, # 流式下载大文件
)
if response.status_code == 200:
# 将响应内容写入 ZIP 文件
export_path = "/path/to/export/dashboard_export.zip"
with open(export_path, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
print(f"仪表板已导出到: {export_path}")
# ============================================================
# 通过 API 导入仪表板
# ============================================================
# 导入时需要上传 ZIP 文件
# Import options sent as form fields next to the bundle.
import_data = {
    "passwords": json.dumps({
        # Exported bundles do not carry database passwords. They are
        # supplied here, keyed by the path of the database file inside
        # the ZIP.
        "databases/prod_db.yaml": "db_password_here",
    }),
    # To replace existing assets, additionally send the form field
    # "overwrite": "true".
}
# A bundle produced by /api/v1/dashboard/export/ is imported through the
# matching /api/v1/dashboard/import/ endpoint, which reads the upload from
# the "formData" field. The `with` block closes the file handle even if
# the request raises.
with open("/path/to/export/dashboard_export.zip", "rb") as bundle:
    import_files = {
        "formData": (
            "dashboard_export.zip",
            bundle,
            "application/zip",
        ),
    }
    response = requests.post(
        f"{SUPERSET_URL}/api/v1/dashboard/import/",
        headers={"Authorization": f"Bearer {access_token}"},
        files=import_files,
        data=import_data,
    )
print(f"导入结果: {response.status_code} - {response.json()}")
十七、完整实战案例 --- 从零构建销售分析仪表板
17.1 端到端自动化脚本
python
#!/usr/bin/env python3
# ============================================================
# build_sales_dashboard.py
# 完整的端到端自动化脚本:
# 1. 登录 Superset
# 2. 注册数据库连接
# 3. 创建数据集
# 4. 创建各类图表
# 5. 组装仪表板
# 6. 发布仪表板
# ============================================================
import requests
import json
import time
# ============================================================
# 全局配置
# ============================================================
SUPERSET_URL = "http://localhost:8088"
ADMIN_USERNAME = "admin"
ADMIN_PASSWORD = "admin123"
def get_access_token(url, username, password):
    """Log in to the Superset REST API and return a JWT access token.

    Args:
        url: base URL of the Superset server.
        username: login name.
        password: login password.

    Returns:
        str: the ``access_token`` value from the login response.

    Raises:
        requests.HTTPError: if the login request returns an error status.
    """
    credentials = {
        "username": username,
        "password": password,
        "provider": "db",
        "refresh": True,
    }
    response = requests.post(
        f"{url}/api/v1/security/login",
        headers={"Content-Type": "application/json"},
        data=json.dumps(credentials),
        timeout=30,  # never hang forever on an unreachable server
    )
    # Fail early on a rejected login
    response.raise_for_status()
    token = response.json()["access_token"]
    print(f"[INFO] 登录成功,Token: {token[:30]}...")
    return token
def get_api_headers(url, token):
    """Build the request headers for authenticated Superset API calls.

    Fetches a CSRF token from the server and combines it with the bearer
    token.

    Args:
        url: base URL of the Superset server (also sent as ``Referer``).
        token: access token obtained from the login endpoint.

    Returns:
        dict: headers with content type, bearer authorization, CSRF token
        and referer.

    Raises:
        requests.HTTPError: if the CSRF endpoint returns an error status.
    """
    # NOTE(review): the CSRF token may be tied to the session cookie of this
    # request; if write calls are rejected, use one requests.Session for all calls.
    csrf_response = requests.get(
        f"{url}/api/v1/security/csrf_token/",
        headers={"Authorization": f"Bearer {token}"},
        timeout=30,
    )
    # Without this check a failed call surfaces as an opaque KeyError below
    csrf_response.raise_for_status()
    csrf_token = csrf_response.json()["result"]
    return {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {token}",
        "X-CSRFToken": csrf_token,
        "Referer": url,
    }
def create_database(url, headers, db_config):
    """Register a database connection, reusing an existing one by name.

    Args:
        url: base URL of the Superset server.
        headers: authenticated request headers.
        db_config: database definition; must contain ``database_name``.

    Returns:
        int: ID of the created database, or of an existing database with
        the same name when the server answers 422.

    Raises:
        Exception: if the database can neither be created nor found.
    """
    response = requests.post(
        f"{url}/api/v1/database/",
        headers=headers,
        data=json.dumps(db_config),
        timeout=30,
    )
    if response.status_code == 201:
        db_id = response.json()["id"]
        print(f"[INFO] 数据库注册成功,ID: {db_id}")
        return db_id
    if response.status_code == 422:
        # Possibly a duplicate: look the database up by name instead.
        print(f"[WARN] 数据库可能已存在: {response.json()}")
        resp = requests.get(f"{url}/api/v1/database/", headers=headers, timeout=30)
        # NOTE(review): the list endpoint is paginated; only the first page
        # is searched here.
        for db in resp.json().get("result", []):
            if db["database_name"] == db_config["database_name"]:
                print(f"[INFO] 使用已有数据库,ID: {db['id']}")
                return db["id"]
    raise Exception(f"创建数据库失败: {response.text}")
def create_dataset(url, headers, dataset_config):
    """Register a dataset for a table of an already registered database.

    Args:
        url: base URL of the Superset server.
        headers: authenticated request headers.
        dataset_config: dataset definition sent as the JSON request body.

    Returns:
        int: ID of the created dataset.

    Raises:
        Exception: if the server does not answer 201.
    """
    response = requests.post(
        f"{url}/api/v1/dataset/",
        headers=headers,
        data=json.dumps(dataset_config),
        timeout=30,  # never hang forever on an unresponsive server
    )
    if response.status_code != 201:
        raise Exception(f"创建数据集失败: {response.text}")
    ds_id = response.json()["id"]
    print(f"[INFO] 数据集注册成功,ID: {ds_id}")
    return ds_id
def create_chart(url, headers, chart_config):
    """
    Create a chart (slice).

    Args:
        url: Superset base URL.
        headers: API request headers.
        chart_config: Chart definition, sent as the JSON request body; its
            ``slice_name`` is echoed in the success message.

    Returns:
        int: ID of the new chart.

    Raises:
        Exception: If the server answers with anything other than HTTP 201.
    """
    endpoint = f"{url}/api/v1/chart/"
    payload = json.dumps(chart_config)
    result = requests.post(endpoint, headers=headers, data=payload)

    # Guard clause: every status except "201 Created" is a failure.
    if result.status_code != 201:
        raise Exception(f"创建图表失败: {result.text}")

    new_id = result.json()["id"]
    print(f"[INFO] 图表 '{chart_config['slice_name']}' 创建成功,ID: {new_id}")
    return new_id
def create_dashboard(url, headers, dashboard_config):
    """
    Create a dashboard.

    Args:
        url: Superset base URL.
        headers: API request headers.
        dashboard_config: Dashboard definition, sent as the JSON request body.

    Returns:
        int: ID of the new dashboard.

    Raises:
        Exception: If the server answers with anything other than HTTP 201.
    """
    endpoint = f"{url}/api/v1/dashboard/"
    payload = json.dumps(dashboard_config)
    result = requests.post(endpoint, headers=headers, data=payload)

    # Guard clause: every status except "201 Created" is a failure.
    if result.status_code != 201:
        raise Exception(f"创建仪表板失败: {result.text}")

    new_id = result.json()["id"]
    print(f"[INFO] 仪表板创建成功,ID: {new_id}")
    return new_id
# ============================================================
# 主执行流程
# ============================================================
def _simple_metric(column_name, aggregate, label):
    """Return an ad-hoc SIMPLE metric: ``aggregate`` over ``column_name``, shown as ``label``."""
    return {
        "expressionType": "SIMPLE",
        "column": {"column_name": column_name},
        "aggregate": aggregate,
        "label": label,
    }


def _layout_row(row_id, child_ids):
    """Return a ROW node of the dashboard layout holding ``child_ids``."""
    return {
        "type": "ROW",
        "id": row_id,
        "children": list(child_ids),
        "parents": ["ROOT_ID", "GRID_ID"],
    }


def _layout_chart(node_id, row_id, chart_id, width, height):
    """Return a CHART node of the dashboard layout placed inside ``row_id``."""
    return {
        "type": "CHART",
        "id": node_id,
        "children": [],
        "parents": ["ROOT_ID", "GRID_ID", row_id],
        "meta": {"chartId": chart_id, "width": width, "height": height},
    }


def main():
    """
    Build the sales analysis dashboard end to end.

    Runs the steps in order: log in, register the database, register the
    dataset, create five charts, create the dashboard, then upload the
    dashboard layout. Progress is printed to stdout.

    Raises:
        Exception: Propagated from the ``create_*`` helpers when a resource
            cannot be created. A failed layout update is only reported as a
            warning and does not abort the run.
    """
    print("=" * 60)
    print(" Superset 销售分析仪表板 - 自动化构建脚本")
    print("=" * 60)

    # --- Step 1: authenticate ---
    print("\n[Step 1] 登录 Superset...")
    token = get_access_token(SUPERSET_URL, ADMIN_USERNAME, ADMIN_PASSWORD)
    headers = get_api_headers(SUPERSET_URL, token)

    # --- Step 2: register the database ---
    print("\n[Step 2] 注册数据库连接...")
    db_id = create_database(SUPERSET_URL, headers, {
        "database_name": "sales_postgres",
        "sqlalchemy_uri": "postgresql+psycopg2://analyst:pass@localhost:5432/sales",
        "expose_in_sqllab": True,
        "allow_ctas": True,
        "allow_cvas": True,
        "allow_run_async": True,
    })

    # --- Step 3: register the dataset ---
    print("\n[Step 3] 注册数据集...")
    dataset_id = create_dataset(SUPERSET_URL, headers, {
        "database": db_id,
        "table_name": "orders",
        "schema": "public",
    })
    datasource = f"{dataset_id}__table"

    # --- Step 4: create the charts ---
    print("\n[Step 4] 创建图表...")

    # Chart 1: KPI - total revenue
    chart_revenue = create_chart(SUPERSET_URL, headers, {
        "slice_name": "总收入",
        "viz_type": "big_number_total",
        "datasource_id": dataset_id,
        "datasource_type": "table",
        "params": json.dumps({
            "datasource": datasource,
            "metric": _simple_metric("order_amount", "SUM", "总收入"),
            "y_axis_format": ",.0f",
            "time_range": "This month",
            "header_font_size": 0.4,
            "subheader_font_size": 0.15,
        }),
    })

    # Chart 2: KPI - order count
    chart_orders = create_chart(SUPERSET_URL, headers, {
        "slice_name": "本月订单数",
        "viz_type": "big_number_total",
        "datasource_id": dataset_id,
        "datasource_type": "table",
        "params": json.dumps({
            "datasource": datasource,
            "metric": _simple_metric("order_id", "COUNT", "订单数"),
            "y_axis_format": ",d",
            "time_range": "This month",
        }),
    })

    # Chart 3: line chart - monthly trend
    chart_trend = create_chart(SUPERSET_URL, headers, {
        "slice_name": "月度销售趋势",
        "viz_type": "echarts_timeseries",
        "datasource_id": dataset_id,
        "datasource_type": "table",
        "params": json.dumps({
            "datasource": datasource,
            "x_axis": "order_date",
            "time_grain_sqla": "P1M",
            "metrics": [_simple_metric("order_amount", "SUM", "月度收入")],
            "groupby": [],
            "row_limit": 10000,
            "show_legend": True,
            "rich_tooltip": True,
            "y_axis_format": ",.0f",
            "smooth": True,
            # Boolean flag: the former string "None" is truthy once it
            # reaches the frontend, which is the opposite of "no area fill".
            "area": False,
            "time_range": "Last year",
        }),
    })

    # Chart 4: pie chart - revenue by region
    chart_region = create_chart(SUPERSET_URL, headers, {
        "slice_name": "区域销售分布",
        "viz_type": "pie",
        "datasource_id": dataset_id,
        "datasource_type": "table",
        "params": json.dumps({
            "datasource": datasource,
            "groupby": ["customer_region"],
            "metric": _simple_metric("order_amount", "SUM", "收入"),
            "row_limit": 10,
            "color_scheme": "supersetColors",
            "show_labels": True,
            "label_type": "key_percent",  # label shows name + percentage
            "number_format": ",.0f",
            "date_format": "smart_date",
            "show_legend": True,
            "legendType": "scroll",
            "legendOrientation": "top",
            "innerRadius": 40,  # inner radius (> 0 renders a donut)
            "outerRadius": 80,  # outer radius
            "time_range": "This year",
        }),
    })

    # Chart 5: table - top 10 products
    # A "top N by metric" table is an aggregate query: group by the product
    # columns and sort on the revenue metric. The previous form mixed the
    # raw-mode "all_columns" with metrics and passed a double-encoded
    # "order_by_cols", so the ranking was not applied.
    top_revenue_metric = _simple_metric("order_amount", "SUM", "总销售额")
    chart_top_products = create_chart(SUPERSET_URL, headers, {
        "slice_name": "Top 10 热销产品",
        "viz_type": "table",
        "datasource_id": dataset_id,
        "datasource_type": "table",
        "params": json.dumps({
            "datasource": datasource,
            "query_mode": "aggregate",
            "groupby": [
                "product_name",
                "product_category",
            ],
            "metrics": [
                top_revenue_metric,
                _simple_metric("order_id", "COUNT", "订单数"),
            ],
            "timeseries_limit_metric": top_revenue_metric,  # sort key
            "order_desc": True,  # highest revenue first
            "row_limit": 10,  # top 10
            "page_length": 10,
            "include_search": True,  # show the search box
            "table_timestamp_format": "%Y-%m-%d",
            "show_cell_bars": True,  # draw data bars inside cells
            "color_pn": True,  # colour positive/negative values differently
            "time_range": "This year",
        }),
    })

    # --- Step 5: create the dashboard ---
    print("\n[Step 5] 创建仪表板...")
    dashboard_id = create_dashboard(SUPERSET_URL, headers, {
        "dashboard_title": "销售分析驾驶舱",
        "slug": "sales-dashboard",
        "published": True,
    })

    # --- Step 6: configure the dashboard layout ---
    print("\n[Step 6] 配置仪表板布局...")
    # NOTE(review): depending on the Superset version, charts may also have
    # to be associated with the dashboard (e.g. through the chart's
    # "dashboards" field) before they render in this layout - confirm.
    position = {
        "DASHBOARD_VERSION_KEY": "v2",
        "ROOT_ID": {
            "type": "ROOT",
            "id": "ROOT_ID",
            "children": ["GRID_ID"],
        },
        "GRID_ID": {
            "type": "GRID",
            "id": "GRID_ID",
            "children": ["ROW-1", "ROW-2", "ROW-3"],
            "parents": ["ROOT_ID"],
        },
        "DASHBOARD_HEADER_ID": {
            "type": "HEADER",
            "id": "DASHBOARD_HEADER_ID",
            "meta": {"text": "销售分析驾驶舱"},
        },
        # Row 1: KPI cards
        "ROW-1": _layout_row("ROW-1", ["CHART-KPI-1", "CHART-KPI-2"]),
        "CHART-KPI-1": _layout_chart("CHART-KPI-1", "ROW-1", chart_revenue, 6, 40),
        "CHART-KPI-2": _layout_chart("CHART-KPI-2", "ROW-1", chart_orders, 6, 40),
        # Row 2: trend chart + pie chart
        "ROW-2": _layout_row("ROW-2", ["CHART-TREND", "CHART-REGION"]),
        "CHART-TREND": _layout_chart("CHART-TREND", "ROW-2", chart_trend, 8, 60),
        "CHART-REGION": _layout_chart("CHART-REGION", "ROW-2", chart_region, 4, 60),
        # Row 3: top products table
        "ROW-3": _layout_row("ROW-3", ["CHART-TOP"]),
        "CHART-TOP": _layout_chart("CHART-TOP", "ROW-3", chart_top_products, 12, 50),
    }

    # Upload the layout and dashboard metadata.
    update_response = requests.put(
        f"{SUPERSET_URL}/api/v1/dashboard/{dashboard_id}",
        headers=headers,
        data=json.dumps({
            "position_json": json.dumps(position),
            "json_metadata": json.dumps({
                "refresh_frequency": 300,  # auto-refresh every 5 minutes
                "color_scheme": "supersetColors",
            }),
        }),
    )
    if update_response.status_code == 200:
        print("[INFO] 仪表板布局更新成功!")
    else:
        # Best effort: the dashboard itself exists, so only warn.
        print(f"[WARN] 仪表板布局更新失败: {update_response.text}")

    # --- Done ---
    print("\n" + "=" * 60)
    print(" 构建完成!")
    print(f" 仪表板访问地址: {SUPERSET_URL}/superset/dashboard/{dashboard_id}/")
    print(f" 或短链接: {SUPERSET_URL}/superset/dashboard/sales-dashboard/")
    print("=" * 60)
# ============================================================
# 入口点
# ============================================================
# Run the build only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
总结速查表
┌──────────────────────────────────────────────────────────────┐
│ Superset 知识点速查表 │
├──────────────┬───────────────────────────────────────────────┤
│ 安装方式 │ Docker Compose / PyPI / Helm (K8s) │
├──────────────┼───────────────────────────────────────────────┤
│ 默认端口 │ 8088 │
├──────────────┼───────────────────────────────────────────────┤
│ 配置文件 │ superset_config.py │
├──────────────┼───────────────────────────────────────────────┤
│ 元数据库 │ PostgreSQL / MySQL (SQLAlchemy URI) │
├──────────────┼───────────────────────────────────────────────┤
│ 缓存层 │ Redis (CACHE_CONFIG / DATA_CACHE_CONFIG) │
├──────────────┼───────────────────────────────────────────────┤
│ 异步任务 │ Celery + Redis/RabbitMQ │
├──────────────┼───────────────────────────────────────────────┤
│ API 认证 │ JWT Bearer Token + CSRF Token │
├──────────────┼───────────────────────────────────────────────┤
│ SQL 模板 │ Jinja2 (from_dttm, to_dttm, filter_values) │
├──────────────┼───────────────────────────────────────────────┤
│ 图表类型 │ Line / Bar / Pie / Table / Pivot / Map 等 │
├──────────────┼───────────────────────────────────────────────┤
│ 高级分析 │ 滚动平均 / 时间比较 / 数据重采样 │
├──────────────┼───────────────────────────────────────────────┤
│ 权限模型 │ RBAC (角色) + RLS (行级安全) │
├──────────────┼───────────────────────────────────────────────┤
│ 报表推送 │ Alerts & Reports (邮件 / Slack / Webhook) │
├──────────────┼───────────────────────────────────────────────┤
│ 资产管理 │ 导出 YAML/ZIP → 导入 (superset CLI / API) │
├──────────────┼───────────────────────────────────────────────┤
│ 语义层 │ 虚拟指标 / 虚拟列 / 度量表达式 │
├──────────────┼───────────────────────────────────────────────┤
│ 文件上传 │ CSV / Excel → 自动建表 │
└──────────────┴───────────────────────────────────────────────┘