MongoDB分片案例

MongoDB分片案例

1.实际案例-1

  • 按地区和时间分区数据
  • 实现就近读取,降低查询延迟,减少网络开销
json 复制代码
1. 假设我们有一个 电商系统,包含:
2. 数据库:ecommerce
3. 集合:orders(订单表)
4. 分片键:{ region: 1, order_date: 1 }(地区 + 订单日期)

1.1 对集合进行分片

json 复制代码
// Shard the ecommerce.orders collection on the compound key
// { region, order_date }, both fields ascending.
sh.shardCollection("ecommerce.orders", { region: 1, order_date: 1 })

1.2 将分片与区域关联

json 复制代码
// Associate each shard with a zone: shard01 -> "northChina", shard02 -> "southChina".
// NOTE(review): "shard01"/"shard02" are presumably the shard names registered
// in this cluster - confirm against sh.status().
db.adminCommand({ addShardToZone: "shard01", zone: "northChina" })
db.adminCommand({ addShardToZone: "shard02", zone: "southChina" })

1.3 创建分片键值与区域之间的关联

json 复制代码
// Register one zone key range per region on ecommerce.orders. Each range
// spans { region, MinKey } to { region, MaxKey }, i.e. every order_date
// value of that region is associated with the given zone.
function assignRegionsToZone(regionNames, zoneName) {
    for (var i = 0; i < regionNames.length; i++) {
        var regionName = regionNames[i];
        db.adminCommand({
            updateZoneKeyRange: "ecommerce.orders",
            min: { region: regionName, order_date: MinKey },
            max: { region: regionName, order_date: MaxKey },
            zone: zoneName
        });
    }
}

// Regions served by the "northChina" zone.
var northChinaRegions = ["北京", "天津", "河北", "河南","山东","山西", "内蒙古","辽宁","吉林","黑龙江"];
assignRegionsToZone(northChinaRegions, "northChina");

// Regions served by the "southChina" zone.
var southChinaRegions = ["上海", "江苏", "浙江", "安徽", "福建", "江西", "广东", "广西", "海南"];
assignRegionsToZone(southChinaRegions, "southChina");

1.4 查看zone是否设置成功

json 复制代码
// Switch to the config database and list the key ranges recorded in its
// tags collection for ecommerce.orders, one query per zone name.
use config
db.tags.find({"ns": "ecommerce.orders","tag":"northChina"}).pretty()
db.tags.find({"ns": "ecommerce.orders","tag":"southChina"}).pretty()


1.5 插入数据

python 复制代码
# 按地区顺序插入数据,时间严格递增覆盖整个范围
import math
import random
import time
from datetime import datetime, timedelta

import pymongo

# Open the connection and select the "ecommerce" database.
# NOTE(review): the URI embeds root credentials in plain text; acceptable for
# a local demo only - load them from the environment for anything shared.
client = pymongo.MongoClient("mongodb://root:123@127.0.0.1:27019/admin")
db = client.ecommerce

# Zone name -> region values generated for that zone.
regions = {
    "northChina": ["北京", "天津", "河北", "河南", "山东", "山西", "内蒙古", "辽宁", "吉林", "黑龙江"],
    "southChina": ["上海", "江苏", "浙江", "安徽", "福建", "江西", "广东", "广西", "海南"]
}

# Number of documents generated per region.
TOTAL_PER_REGION = 1000000
# Documents per insert_many call; kept small to limit memory use.
BATCH = 5000

# order_date values span 2016-01-01 00:00:00 through 2025-12-31 23:59:59.
start_date = datetime(2016, 1, 1)
end_date = datetime(2025, 12, 31, 23, 59, 59)
total_seconds = int((end_date - start_date).total_seconds())

print("开始按地区顺序插入数据(时间严格递增)...")
print(f"每个地区插入: {TOTAL_PER_REGION:,} 条数据")
print(f"时间范围: {start_date} 到 {end_date}")
print(f"总秒数: {total_seconds:,} 秒")

total_regions = sum(map(len, regions.values()))

# Flatten the zone -> regions mapping into one work item per region,
# preserving the zone order and the region order within each zone.
region_data = []
for zone, region_list in regions.items():
    region_data.extend(
        {"region": region, "zone": zone, "total_docs": TOTAL_PER_REGION}
        for region in region_list
    )

# Insert TOTAL_PER_REGION orders for every region, one region at a time, in
# batches of at most BATCH documents. Within a region the order_date values
# advance evenly from start_date towards end_date.
#
# A failed batch is retried with the SAME document objects. insert_many adds
# an "_id" to each dict it is given, so a retry reuses those ids and documents
# that were already stored are rejected as duplicates instead of being
# inserted a second time. Retries are bounded so a persistent failure aborts
# the script instead of looping forever.

MAX_INSERT_RETRIES = 5      # attempts per batch before giving up
DUPLICATE_KEY_CODE = 11000  # MongoDB error code for a duplicate-key write

# Spacing in seconds between consecutive orders; identical for every region,
# so it is computed once.
time_increment = total_seconds / TOTAL_PER_REGION

for idx, region_info in enumerate(region_data, 1):
    region = region_info["region"]
    zone = region_info["zone"]

    print(f"\n[{idx}/{len(region_data)}] 插入地区: {region} (Zone: {zone})")

    inserted = 0

    while inserted < TOTAL_PER_REGION:
        batch_size = min(BATCH, TOTAL_PER_REGION - inserted)
        batch = []

        for j in range(batch_size):
            sequence = inserted + j + 1

            # Evenly spaced offset plus 0-59 s of jitter. The order stays
            # non-decreasing as long as the spacing is at least 59 s.
            seconds_offset = int(time_increment * (inserted + j))
            doc_date = start_date + timedelta(
                seconds=seconds_offset + random.randint(0, 59)
            )

            # Clamp to the end of the range; moving the value back by a random
            # amount would break the increasing order.
            if doc_date > end_date:
                doc_date = end_date

            batch.append({
                "order_id": f"ORD_{region}_{sequence:08d}_{doc_date.strftime('%Y%m%d_%H%M%S')}",
                "region": region,
                "order_date": doc_date,
                "amount": round(random.uniform(50, 2000), 2),
                "status": random.choice(['待付款', '已付款', '已发货']),
                "zone": zone,
                "insert_sequence": sequence
            })

        last_error = None
        for attempt in range(1, MAX_INSERT_RETRIES + 1):
            try:
                db.orders.insert_many(batch, ordered=False)
                break
            except pymongo.errors.BulkWriteError as e:
                details = e.details or {}
                write_errors = details.get("writeErrors", [])
                only_duplicates = bool(write_errors) and all(
                    err.get("code") == DUPLICATE_KEY_CODE for err in write_errors
                )
                # On a retry, duplicate-key errors only mean those documents
                # were stored by an earlier attempt: the batch is complete.
                if attempt > 1 and only_duplicates and not details.get("writeConcernErrors"):
                    break
                last_error = e
            except pymongo.errors.PyMongoError as e:
                last_error = e

            print(f"  插入出错: {last_error}")
            if attempt < MAX_INSERT_RETRIES:
                time.sleep(attempt)  # simple linear backoff before retrying
        else:
            raise RuntimeError(
                f"地区 {region} 的批次在 {MAX_INSERT_RETRIES} 次尝试后仍然插入失败"
            ) from last_error

        inserted += batch_size

        if inserted % 50000 == 0 or inserted == TOTAL_PER_REGION:
            progress_pct = (inserted / TOTAL_PER_REGION) * 100
            # Report the newest timestamp written so far.
            current_date = batch[-1]["order_date"]
            print(f"  进度: {inserted:,}/{TOTAL_PER_REGION:,} ({progress_pct:.1f}%) - 当前时间: {current_date.strftime('%Y-%m-%d %H:%M:%S')}")

    # Read back the region's document count and its earliest/latest order_date.
    count = db.orders.count_documents({"region": region})
    first_doc = db.orders.find_one({"region": region}, sort=[("order_date", 1)])
    last_doc = db.orders.find_one({"region": region}, sort=[("order_date", -1)])

    if first_doc and last_doc:
        print(f"✅ 完成地区: {region}")
        print(f"   插入数量: {count:,}")
        print(f"   时间范围: {first_doc['order_date'].strftime('%Y-%m-%d %H:%M:%S')} 到 {last_doc['order_date'].strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print(f"⚠️  地区 {region} 数据验证失败")

# Closing report: per-region counts and date ranges grouped by zone, then
# collection-wide totals, and finally the connection is closed.
banner = "=" * 60
print(f"\n{banner}")
print("所有地区插入完成!")
print("\n各地区数据统计:")

for zone, region_list in regions.items():
    print(f"\nZone: {zone}")
    print("-" * 30)
    zone_total = 0

    for region in region_list:
        region_count = db.orders.count_documents({"region": region})
        zone_total += region_count

        # Earliest and latest order_date for this region in a single group.
        date_bounds = list(db.orders.aggregate([
            {"$match": {"region": region}},
            {"$group": {
                "_id": None,
                "min_date": {"$min": "$order_date"},
                "max_date": {"$max": "$order_date"},
                "count": {"$sum": 1}
            }}
        ]))

        # The aggregation yields no row when the region has no documents.
        if not date_bounds:
            continue

        summary = date_bounds[0]
        print(f"  {region}: {region_count:,} 条")
        print(f"    时间: {summary['min_date'].strftime('%Y-%m-%d')} 到 {summary['max_date'].strftime('%Y-%m-%d')}")

    print(f"  Zone总计: {zone_total:,} 条")

grand_total = db.orders.count_documents({})
print(f"\n{banner}")
print("全局统计:")
print(f"总文档数: {grand_total:,}")
print(f"总地区数: {len(region_data)}")
print(f"平均每个地区: {grand_total/len(region_data):,.0f} 条")

client.close()

1.6 查看数据分布

javascript 复制代码
// Switch to the ecommerce database and report how the orders collection's
// data is distributed across the shards.
use ecommerce
db.orders.getShardDistribution()
相关推荐
小北方城市网2 小时前
Spring Boot 接口开发实战:RESTful 规范、参数校验与全局异常处理
java·jvm·数据库·spring boot·后端·python·mysql
AllData公司负责人2 小时前
【亲测好用】数据集成管理能力演示
java·大数据·数据库·开源
brevity_souls2 小时前
SQL Server 窗口函数简介
开发语言·javascript·数据库
倚-天-照-海2 小时前
Doris数据库基本概念
数据库
翼龙云_cloud2 小时前
阿里云渠道商:cpu 弹性扩容有哪些限制条件?
数据库·阿里云·云计算
陈聪.2 小时前
HRCE简单实验
linux·运维·数据库
APIshop3 小时前
实战代码解析:item_get——获取某鱼商品详情接口
java·linux·数据库
洛_尘3 小时前
MySQL 5:增删改查操作
数据库·mysql
老邓计算机毕设3 小时前
SSM养老院老人健康信息管理系统t4p4x(程序+源码+数据库+调试部署+开发环境)带论文文档1万字以上,文末可获取,系统界面在最后面
数据库·计算机毕业设计·ssm 框架·养老院老人健康管理系统