gRPC服务熔断与限流设计

gRPC服务熔断与限流设计

创建日期 : 2026-03-26
作者 : zry
标签: gRPC, 微服务, 熔断器, 限流, 弹性设计, AIDC


目录

  1. 引言
  2. 微服务故障模式分析
  3. 熔断器设计原理
  4. 限流算法与实现
  5. gRPC拦截器实现
  6. 分布式熔断与限流
  7. AIDC系统实战案例
  8. 监控与告警
  9. 最佳实践
  10. 总结

引言

在AIDC自动气象站数据收集系统中,Web模块通过gRPC与其他四个核心模块(ISOS、Connect、Update、MQTT)进行通信。当某个下游服务出现故障或性能下降时,如果不加以控制,故障可能会级联传播,最终导致整个系统崩溃。

本文将深入探讨如何通过熔断器和限流机制来增强gRPC服务的弹性,确保AIDC系统在高负载和故障场景下依然能够稳定运行。


微服务故障模式分析

AIDC系统故障场景

级联影响
故障模式
gRPC调用链
用户请求
客户端应用
Web模块:50051
服务发现
ISOS:50052
Connect:50053
Update:50054
MQTT:1883
响应延迟高
连接超时
CPU满载
消息堆积
Web线程池耗尽
全系统不可用

常见故障模式

故障模式 症状 影响 防护手段
服务超时 请求响应时间>阈值 线程池耗尽 熔断器开启
服务错误 HTTP 5xx/gRPC错误码 错误传播 熔断器计数
服务过载 CPU/内存饱和 响应变慢 限流器拒绝
网络分区 连接丢失 请求堆积 快速失败
级联故障 故障A导致B不可用 雪崩效应 舱壁隔离

熔断器设计原理

熔断器状态机

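状态转换:

  - 初始化 → Closed
  - Closed → Open:失败率 > 阈值
  - Open → HalfOpen:超时后
  - HalfOpen → Closed:探测成功
  - HalfOpen → Open:探测失败
  - Closed → Closed:请求成功/失败计数

各状态说明:

  - Closed(正常状态):请求正常转发;记录成功/失败
  - Open(熔断状态):快速失败返回;定期健康检查
  - HalfOpen(半开状态):放行少量请求;测试服务恢复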

熔断器核心实现

cpp 复制代码
// circuit_breaker.hpp
#pragma once
#include <atomic>
#include <chrono>
#include <cstdint>
#include <deque>
#include <functional>
#include <mutex>
#include <optional>
#include <string>
#include <type_traits>

namespace aidc::resilience {

// Circuit breaker state.
enum class CircuitState {
    CLOSED,      // Closed: requests pass through normally
    OPEN,        // Open: requests are rejected
    HALF_OPEN    // Half-open: trial requests are let through
};

// Outcome of a single protected request.
// CircuitBreaker::execute() records FAILURE when the callable throws and
// TIMEOUT when the call ran longer than the configured slow-call threshold;
// every value other than SUCCESS counts towards the failure rate.
enum class RequestResult {
    SUCCESS,
    FAILURE,
    TIMEOUT
};

// Tunable parameters of a CircuitBreaker. Every field has a default, so a
// default-constructed config is usable as-is.
struct CircuitBreakerConfig {
    // Failure rate, in percent, at or above which the breaker trips.
    double failure_threshold_percent{50.0};

    // Minimum number of recorded requests before the breaker may trip.
    uint32_t min_request_threshold{10};

    // Time spent open before the breaker moves to half-open.
    std::chrono::milliseconds open_duration = std::chrono::milliseconds(5000);

    // Number of requests admitted while half-open.
    uint32_t half_open_max_requests{3};

    // Length of the statistics window.
    std::chrono::milliseconds window_size = std::chrono::milliseconds(60000);

    // Slow-call threshold; calls that take longer are treated as failures.
    std::chrono::milliseconds slow_call_threshold = std::chrono::milliseconds(1000);
};

// Snapshot of a circuit breaker's statistics.
// Every field has a default so that a default-constructed snapshot never
// carries indeterminate values (the counters are incremented in place by
// CircuitBreaker::get_stats()). The struct remains an aggregate.
struct CircuitBreakerStats {
    CircuitState current_state = CircuitState::CLOSED;  // state at snapshot time
    uint64_t total_requests = 0;        // requests currently in the statistics window
    uint64_t successful_requests = 0;   // window entries recorded as SUCCESS
    uint64_t failed_requests = 0;       // window entries recorded as anything else
    uint64_t rejected_requests = 0;     // requests refused by the breaker
    double current_failure_rate = 0.0;  // failed / total, in percent
    std::chrono::system_clock::time_point last_state_change{};  // wall-clock time of the last transition
};

// Circuit breaker guarding calls to one downstream dependency.
//
// State is held in an atomic; transitions are performed under state_mutex_
// and the sliding window of recent results is guarded by records_mutex_.
class CircuitBreaker {
public:
    // name:   label used in log messages.
    // config: thresholds and timings; copied into the breaker.
    explicit CircuitBreaker(const std::string& name,
                           const CircuitBreakerConfig& config);

    // Runs `func` under the breaker's protection.
    //
    // Returns std::nullopt without calling `func` when the breaker refuses
    // the request; otherwise returns the callable's result. A call that
    // takes longer than config.slow_call_threshold is recorded as TIMEOUT
    // (its result is still returned). If `func` throws, a FAILURE is
    // recorded and the exception is rethrown.
    //
    // The optional holds the decayed result type, so callables returning a
    // reference are supported (std::optional<T&> would be ill-formed); the
    // referenced value is copied into the optional.
    template<typename Func>
    auto execute(Func&& func) -> std::optional<std::decay_t<decltype(func())>> {
        if (!allow_request()) {
            return std::nullopt;  // breaker refused the call: fail fast
        }

        const auto start = std::chrono::steady_clock::now();
        try {
            auto result = func();
            const auto duration = std::chrono::steady_clock::now() - start;

            record_result(duration > config_.slow_call_threshold
                              ? RequestResult::TIMEOUT
                              : RequestResult::SUCCESS);

            return result;
        } catch (...) {
            record_result(RequestResult::FAILURE);
            throw;
        }
    }

    // Records the outcome of a call by hand (for asynchronous call sites).
    void record_result(RequestResult result);

    // Current state.
    CircuitState get_state() const;

    // Snapshot of the current statistics.
    CircuitBreakerStats get_stats() const;

    // Forced state transitions (for administrative use).
    void force_open();
    void force_closed();
    void force_half_open();

private:
    bool allow_request();
    void transition_to(CircuitState new_state);
    void update_failure_rate();
    bool should_trip();

    // One entry of the sliding statistics window.
    struct RequestRecord {
        RequestResult result;
        std::chrono::steady_clock::time_point timestamp;
    };

    std::string name_;
    CircuitBreakerConfig config_;

    std::atomic<CircuitState> state_{CircuitState::CLOSED};
    std::atomic<uint64_t> rejected_count_{0};

    mutable std::mutex records_mutex_;          // guards recent_records_
    std::deque<RequestRecord> recent_records_;  // oldest entry at the front

    // Held around state transitions.
    // NOTE(review): last_state_change_ is also read by get_stats(), which
    // only takes records_mutex_ — confirm whether that read needs this lock.
    std::mutex state_mutex_;
    std::chrono::system_clock::time_point last_state_change_;
    std::chrono::steady_clock::time_point open_start_time_;
    std::atomic<uint32_t> half_open_requests_{0};
};

} // namespace aidc::resilience

熔断器实现细节

cpp 复制代码
// circuit_breaker.cpp
#include "circuit_breaker.hpp"
#include <spdlog/spdlog.h>

namespace aidc::resilience {

CircuitBreaker::CircuitBreaker(const std::string& name,
                               const CircuitBreakerConfig& config)
    : name_(name)
    , config_(config)
    , last_state_change_(std::chrono::system_clock::now()) {}

bool CircuitBreaker::allow_request() {
    CircuitState current = state_.load(std::memory_order_acquire);
    
    switch (current) {
        case CircuitState::CLOSED:
            return true;
            
        case CircuitState::OPEN: {
            // 检查是否到了半开探测时间
            auto now = std::chrono::steady_clock::now();
            if (now - open_start_time_ >= config_.open_duration) {
                std::lock_guard<std::mutex> lock(state_mutex_);
                if (state_.load(std::memory_order_acquire) == CircuitState::OPEN) {
                    transition_to(CircuitState::HALF_OPEN);
                    half_open_requests_ = 0;
                }
                return true;
            }
            ++rejected_count_;
            spdlog::warn("CircuitBreaker[{}]: Request rejected (OPEN)", name_);
            return false;
        }
        
        case CircuitState::HALF_OPEN: {
            // 限制半开状态的并发请求数
            uint32_t current = half_open_requests_.fetch_add(1);
            if (current < config_.half_open_max_requests) {
                return true;
            }
            --half_open_requests_;
            ++rejected_count_;
            return false;
        }
    }
    
    return false;
}

// Records the outcome of one call and drives the state machine.
//
// The result is appended to the sliding window (entries older than
// config_.window_size are dropped first). Then:
//  - HALF_OPEN: a SUCCESS closes the breaker, anything else re-opens it;
//  - CLOSED:    the breaker trips to OPEN when should_trip() says so.
void CircuitBreaker::record_result(RequestResult result) {
    const auto now = std::chrono::steady_clock::now();

    {
        std::lock_guard<std::mutex> lock(records_mutex_);

        // Drop expired records.
        const auto cutoff = now - config_.window_size;
        while (!recent_records_.empty() &&
               recent_records_.front().timestamp < cutoff) {
            recent_records_.pop_front();
        }

        // Append the new record.
        recent_records_.push_back({result, now});
    }

    const CircuitState observed = state_.load(std::memory_order_acquire);

    if (observed == CircuitState::HALF_OPEN) {
        // Release the probe slot, saturating at zero: a call admitted before
        // the breaker went half-open never took a slot, and a plain
        // decrement would wrap the unsigned counter.
        uint32_t in_flight = half_open_requests_.load();
        while (in_flight > 0 &&
               !half_open_requests_.compare_exchange_weak(in_flight, in_flight - 1)) {
        }

        std::lock_guard<std::mutex> lock(state_mutex_);
        if (state_.load(std::memory_order_acquire) != CircuitState::HALF_OPEN) {
            return;  // someone else already moved the breaker on
        }

        if (result == RequestResult::SUCCESS) {
            // Recovery confirmed. Start from an empty window (as
            // force_closed() does): otherwise the failures that tripped the
            // breaker are still inside the window and the very next recorded
            // call would trip it again.
            {
                std::lock_guard<std::mutex> records_lock(records_mutex_);
                recent_records_.clear();
            }
            transition_to(CircuitState::CLOSED);
            spdlog::info("CircuitBreaker[{}]: Closed (recovery confirmed)", name_);
        } else {
            // The probe failed: open again.
            transition_to(CircuitState::OPEN);
            spdlog::warn("CircuitBreaker[{}]: Re-opened (recovery failed)", name_);
        }
        return;
    }

    // should_trip() takes records_mutex_ itself, so it is evaluated before
    // state_mutex_ is acquired.
    if (observed == CircuitState::CLOSED && should_trip()) {
        std::lock_guard<std::mutex> lock(state_mutex_);
        if (state_.load(std::memory_order_acquire) == CircuitState::CLOSED) {
            transition_to(CircuitState::OPEN);
            spdlog::error("CircuitBreaker[{}]: Tripped OPEN", name_);
        }
    }
}

// Returns true when the window holds at least config_.min_request_threshold
// records and the share of non-SUCCESS records, in percent, reaches
// config_.failure_threshold_percent.
bool CircuitBreaker::should_trip() {
    std::lock_guard<std::mutex> guard(records_mutex_);

    const auto sample_count = recent_records_.size();
    if (sample_count < config_.min_request_threshold) {
        return false;  // not enough data to judge
    }

    uint64_t non_success = 0;
    for (auto it = recent_records_.cbegin(); it != recent_records_.cend(); ++it) {
        if (it->result != RequestResult::SUCCESS) {
            ++non_success;
        }
    }

    const double failure_percent =
        (static_cast<double>(non_success) / sample_count) * 100.0;
    return failure_percent >= config_.failure_threshold_percent;
}

// Switches the breaker to `new_state`. When the state actually changes, the
// transition time is stored, the open timer is started if the new state is
// OPEN, and the change is logged.
void CircuitBreaker::transition_to(CircuitState new_state) {
    const CircuitState previous =
        state_.exchange(new_state, std::memory_order_release);
    if (previous == new_state) {
        return;  // nothing changed
    }

    last_state_change_ = std::chrono::system_clock::now();

    if (new_state == CircuitState::OPEN) {
        open_start_time_ = std::chrono::steady_clock::now();
    }

    spdlog::info("CircuitBreaker[{}]: State changed {} -> {}",
                 name_,
                 static_cast<int>(previous),
                 static_cast<int>(new_state));
}

// Returns the current state (acquire load of the atomic state).
CircuitState CircuitBreaker::get_state() const {
    const CircuitState snapshot = state_.load(std::memory_order_acquire);
    return snapshot;
}

// Builds a snapshot of the breaker's statistics from the records currently
// in the sliding window.
CircuitBreakerStats CircuitBreaker::get_stats() const {
    std::lock_guard<std::mutex> lock(records_mutex_);

    // Value-initialize: the success/failure counters are incremented below,
    // so they must start at zero rather than at an indeterminate value.
    CircuitBreakerStats stats{};
    stats.current_state = get_state();
    stats.total_requests = recent_records_.size();
    stats.rejected_requests = rejected_count_.load();
    stats.last_state_change = last_state_change_;

    for (const auto& record : recent_records_) {
        if (record.result == RequestResult::SUCCESS) {
            ++stats.successful_requests;
        } else {
            ++stats.failed_requests;
        }
    }

    stats.current_failure_rate = stats.total_requests > 0
        ? (static_cast<double>(stats.failed_requests) / stats.total_requests) * 100.0
        : 0.0;

    return stats;
}

// Administrative override: moves the breaker to OPEN under the state lock.
void CircuitBreaker::force_open() {
    const std::lock_guard<std::mutex> state_guard(state_mutex_);
    transition_to(CircuitState::OPEN);
}

// Administrative override: empties the statistics window and moves the
// breaker to CLOSED. The state lock is taken first, then the records lock.
void CircuitBreaker::force_closed() {
    const std::lock_guard<std::mutex> state_guard(state_mutex_);
    {
        const std::lock_guard<std::mutex> records_guard(records_mutex_);
        recent_records_.clear();
    }
    transition_to(CircuitState::CLOSED);
}

// Administrative override: moves the breaker to HALF_OPEN and then resets
// the half-open request counter to zero.
void CircuitBreaker::force_half_open() {
    const std::lock_guard<std::mutex> state_guard(state_mutex_);
    transition_to(CircuitState::HALF_OPEN);
    half_open_requests_ = 0;
}

} // namespace aidc::resilience

限流算法与实现

限流算法对比

  - 漏桶:请求入桶 → 固定速率流出(平滑处理)
  - 令牌桶:固定速率令牌生成 → 请求消耗令牌
  - 滑动窗口:持续滑动、精确计数(平滑过渡)
  - 固定窗口:窗口1: 0-60s → 重置 → 窗口2: 60-120s

令牌桶限流器

cpp 复制代码
// rate_limiter.hpp
#pragma once
#include <atomic>
#include <chrono>
#include <mutex>
#include <deque>

namespace aidc::resilience {

// Rate limiter interface.
class RateLimiter {
public:
    virtual ~RateLimiter() = default;
    
    // Try to acquire a single permit.
    virtual bool try_acquire() = 0;
    
    // Try to acquire several permits at once.
    virtual bool try_acquire(uint32_t permits) = 0;
    
    // Block waiting for a permit, giving up after `timeout`.
    virtual bool acquire_with_timeout(std::chrono::milliseconds timeout) = 0;
    
    // Number of permits currently available.
    virtual uint32_t available_permits() const = 0;
};

// Token-bucket rate limiter.
// The bucket starts full (max_permits tokens); refill() adds refill_amount
// tokens for every whole refill_period that has elapsed, never exceeding
// max_permits.
class TokenBucketRateLimiter : public RateLimiter {
public:
    // max_permits:   bucket capacity.
    // refill_period: length of one refill interval.
    // refill_amount: tokens added per refill interval.
    TokenBucketRateLimiter(uint32_t max_permits,
                          std::chrono::milliseconds refill_period,
                          uint32_t refill_amount);
    
    bool try_acquire() override;
    bool try_acquire(uint32_t permits) override;
    bool acquire_with_timeout(std::chrono::milliseconds timeout) override;
    uint32_t available_permits() const override;

private:
    // Adds the tokens earned since the last refill. The callers in this
    // file invoke it while holding mutex_.
    void refill();
    
    const uint32_t max_permits_;
    const std::chrono::milliseconds refill_period_;
    const uint32_t refill_amount_;
    
    mutable std::mutex mutex_;
    std::atomic<double> available_tokens_{0};
    std::chrono::steady_clock::time_point last_refill_;
};

// Sliding-window rate limiter.
// NOTE(review): only the declaration is visible here; presumably at most
// max_requests acquisitions are allowed within any window_size interval —
// confirm against the implementation.
class SlidingWindowRateLimiter : public RateLimiter {
public:
    // max_requests: request budget per window.
    // window_size:  length of the window.
    SlidingWindowRateLimiter(uint32_t max_requests,
                            std::chrono::milliseconds window_size);
    
    bool try_acquire() override;
    bool try_acquire(uint32_t permits) override;
    bool acquire_with_timeout(std::chrono::milliseconds timeout) override;
    uint32_t available_permits() const override;

private:
    void cleanup_old_requests();
    
    const uint32_t max_requests_;
    const std::chrono::milliseconds window_size_;
    
    mutable std::mutex mutex_;
    // Timestamps of requests (steady clock).
    std::deque<std::chrono::steady_clock::time_point> requests_;
};

} // namespace aidc::resilience

令牌桶实现

cpp 复制代码
// token_bucket.cpp
#include "rate_limiter.hpp"
#include <algorithm>
#include <condition_variable>

namespace aidc::resilience {

TokenBucketRateLimiter::TokenBucketRateLimiter(
    uint32_t max_permits,
    std::chrono::milliseconds refill_period,
    uint32_t refill_amount)
    : max_permits_(max_permits)
    , refill_period_(refill_period)
    , refill_amount_(refill_amount)
    , last_refill_(std::chrono::steady_clock::now()) {
    available_tokens_.store(static_cast<double>(max_permits));
}

// Credits the tokens earned since the last refill: refill_amount_ tokens per
// whole refill_period_ elapsed, capped at max_permits_. The callers in this
// file invoke it with mutex_ held.
void TokenBucketRateLimiter::refill() {
    const auto now = std::chrono::steady_clock::now();
    const auto elapsed = now - last_refill_;
    if (elapsed < refill_period_) {
        return;  // no complete period has passed yet
    }

    const auto periods = elapsed / refill_period_;  // whole periods only
    const double tokens_to_add = static_cast<double>(periods) * refill_amount_;

    const double current = available_tokens_.load(std::memory_order_relaxed);
    const double new_tokens = std::min(current + tokens_to_add,
                                       static_cast<double>(max_permits_));
    available_tokens_.store(new_tokens, std::memory_order_relaxed);

    // Advance by exactly the periods that were credited. Jumping to `now`
    // would throw away the partially elapsed period on every refill and make
    // the effective rate lower than the configured one.
    last_refill_ += refill_period_ * periods;
}

// Convenience overload: tries to take exactly one token.
bool TokenBucketRateLimiter::try_acquire() {
    constexpr uint32_t kSinglePermit = 1;
    return try_acquire(kSinglePermit);
}

// Tries to take `permits` tokens after topping the bucket up. Returns true
// and deducts the tokens when enough are available; otherwise returns false
// and leaves the bucket unchanged.
bool TokenBucketRateLimiter::try_acquire(uint32_t permits) {
    std::lock_guard<std::mutex> guard(mutex_);
    refill();

    const double balance = available_tokens_.load(std::memory_order_relaxed);
    if (!(balance >= permits)) {
        return false;
    }

    available_tokens_.store(balance - permits, std::memory_order_relaxed);
    return true;
}

// Waits up to `timeout` for a single token.
//
// Polls try_acquire(), sleeping between attempts for the estimated time
// until the next token, but never longer than 10 ms or past the deadline.
// Returns true when a token was acquired, false when the deadline passed.
bool TokenBucketRateLimiter::acquire_with_timeout(
    std::chrono::milliseconds timeout) {

    using clock = std::chrono::steady_clock;
    using std::chrono::milliseconds;
    constexpr milliseconds kMaxNap{10};

    const auto deadline = clock::now() + timeout;

    while (clock::now() < deadline) {
        if (try_acquire()) {
            return true;
        }

        // Estimate the wait under the lock, then sleep with the lock
        // released: sleeping while holding mutex_ would block every other
        // caller of the limiter for the whole nap.
        milliseconds nap = kMaxNap;
        {
            std::lock_guard<std::mutex> lock(mutex_);
            refill();

            const double needed =
                1.0 - available_tokens_.load(std::memory_order_relaxed);
            if (needed <= 0) {
                continue;  // a token became available: retry immediately
            }
            // With refill_amount_ == 0 no token will ever arrive; keep the
            // default nap instead of dividing by zero.
            if (refill_amount_ > 0) {
                const double periods_needed = needed / refill_amount_;
                const auto estimate = std::chrono::duration_cast<milliseconds>(
                    periods_needed * refill_period_);
                nap = std::min(estimate, kMaxNap);
            }
        }

        const auto remaining =
            std::chrono::duration_cast<milliseconds>(deadline - clock::now());
        if (remaining <= milliseconds::zero()) {
            break;
        }
        std::this_thread::sleep_for(std::min(nap, remaining));
    }

    return false;
}

// Returns the number of whole tokens currently in the bucket. The bucket is
// topped up first, which is why the const qualifier is cast away for the
// refill() call.
uint32_t TokenBucketRateLimiter::available_permits() const {
    std::lock_guard<std::mutex> guard(mutex_);

    auto* self = const_cast<TokenBucketRateLimiter*>(this);
    self->refill();

    const double tokens = available_tokens_.load(std::memory_order_relaxed);
    return static_cast<uint32_t>(tokens);
}

} // namespace aidc::resilience

gRPC拦截器实现

熔断拦截器

cpp 复制代码
// grpc_circuit_breaker_interceptor.hpp
#pragma once
#include <grpcpp/grpcpp.h>
#include "circuit_breaker.hpp"

namespace aidc::grpc {

// 客户端熔断拦截器
class CircuitBreakerInterceptor 
    : public grpc::experimental::Interceptor {
public:
    CircuitBreakerInterceptor(
        grpc::experimental::ClientRpcInfo* info,
        std::shared_ptr<resilience::CircuitBreaker> circuit_breaker);
    
    void Intercept(grpc::experimental::InterceptorBatchMethods* methods) override;

private:
    std::shared_ptr<resilience::CircuitBreaker> circuit_breaker_;
    bool intercepted_{false};
};

// 客户端拦截器工厂
class CircuitBreakerInterceptorFactory 
    : public grpc::experimental::ClientInterceptorFactoryInterface {
public:
    explicit CircuitBreakerInterceptorFactory(
        std::shared_ptr<resilience::CircuitBreaker> cb)
        : circuit_breaker_(cb) {}
    
    grpc::experimental::Interceptor* CreateClientInterceptor(
        grpc::experimental::ClientRpcInfo* info) override {
        return new CircuitBreakerInterceptor(info, circuit_breaker_);
    }

private:
    std::shared_ptr<resilience::CircuitBreaker> circuit_breaker_;
};

// 服务端限流拦截器
class RateLimitInterceptor 
    : public grpc::experimental::Interceptor {
public:
    RateLimitInterceptor(
        grpc::experimental::ServerRpcInfo* info,
        std::shared_ptr<resilience::RateLimiter> rate_limiter);
    
    void Intercept(grpc::experimental::InterceptorBatchMethods* methods) override;

private:
    std::shared_ptr<resilience::RateLimiter> rate_limiter_;
};

class RateLimitInterceptorFactory 
    : public grpc::experimental::ServerInterceptorFactoryInterface {
public:
    explicit RateLimitInterceptorFactory(
        std::shared_ptr<resilience::RateLimiter> rl)
        : rate_limiter_(rl) {}
    
    grpc::experimental::Interceptor* CreateServerInterceptor(
        grpc::experimental::ServerRpcInfo* info) override {
        return new RateLimitInterceptor(info, rate_limiter_);
    }

private:
    std::shared_ptr<resilience::RateLimiter> rate_limiter_;
};

} // namespace aidc::grpc

拦截器实现

cpp 复制代码
// grpc_circuit_breaker_interceptor.cpp
#include "grpc_circuit_breaker_interceptor.hpp"
#include <spdlog/spdlog.h>

namespace aidc::grpc {

// Stores the shared breaker; `info` is not used.
// ::grpc:: is spelled out because an unqualified `grpc::` inside namespace
// aidc::grpc resolves to aidc::grpc.
CircuitBreakerInterceptor::CircuitBreakerInterceptor(
    ::grpc::experimental::ClientRpcInfo* info,
    std::shared_ptr<resilience::CircuitBreaker> circuit_breaker)
    : circuit_breaker_(circuit_breaker) {}

void CircuitBreakerInterceptor::Intercept(
    grpc::experimental::InterceptorBatchMethods* methods) {
    
    if (methods->QueryInterceptionHookPoint(
            grpc::experimental::InterceptionHookPoints::PRE_SEND_INITIAL_METADATA)) {
        
        // 检查熔断器状态
        if (circuit_breaker_->get_state() == 
            resilience::CircuitState::OPEN) {
            
            // 熔断器打开,快速失败
            grpc::Status status(grpc::StatusCode::UNAVAILABLE,
                               "Circuit breaker is OPEN");
            
            auto* status_ptr = methods->GetRecvStatus();
            if (status_ptr) {
                *status_ptr = status;
            }
            
            spdlog::warn("gRPC call blocked by circuit breaker");
            
            // 跳过后续处理
            methods->Hijack();
            return;
        }
        
        intercepted_ = true;
    }
    
    if (methods->QueryInterceptionHookPoint(
            grpc::experimental::InterceptionHookPoints::POST_RECV_STATUS)) {
        
        if (intercepted_) {
            auto* status = methods->GetRecvStatus();
            if (status) {
                if (status->ok()) {
                    circuit_breaker_->record_result(
                        resilience::RequestResult::SUCCESS);
                } else {
                    // 根据错误码判断是否为失败
                    auto code = status->error_code();
                    if (code == grpc::StatusCode::DEADLINE_EXCEEDED ||
                        code == grpc::StatusCode::UNAVAILABLE ||
                        code == grpc::StatusCode::INTERNAL) {
                        circuit_breaker_->record_result(
                            resilience::RequestResult::FAILURE);
                    }
                }
            }
        }
    }
    
    methods->Proceed();
}

// Stores the shared limiter; `info` is not used.
// ::grpc:: is spelled out because an unqualified `grpc::` inside namespace
// aidc::grpc resolves to aidc::grpc.
RateLimitInterceptor::RateLimitInterceptor(
    ::grpc::experimental::ServerRpcInfo* info,
    std::shared_ptr<resilience::RateLimiter> rate_limiter)
    : rate_limiter_(rate_limiter) {}

void RateLimitInterceptor::Intercept(
    grpc::experimental::InterceptorBatchMethods* methods) {
    
    if (methods->QueryInterceptionHookPoint(
            grpc::experimental::InterceptionHookPoints::PRE_SEND_INITIAL_METADATA)) {
        
        // 尝试获取许可
        if (!rate_limiter_->try_acquire()) {
            // 限流触发
            grpc::Status status(grpc::StatusCode::RESOURCE_EXHAUSTED,
                               "Rate limit exceeded");
            
            auto* status_ptr = methods->GetRecvStatus();
            if (status_ptr) {
                *status_ptr = status;
            }
            
            spdlog::warn("gRPC call rejected by rate limiter");
            
            methods->Hijack();
            return;
        }
    }
    
    methods->Proceed();
}

} // namespace aidc::grpc

分布式熔断与限流

Redis-based分布式限流

cpp 复制代码
// distributed_rate_limiter.hpp
#pragma once
#include "rate_limiter.hpp"
#include <sw/redis++/redis.h>

namespace aidc::resilience {

// Distributed sliding-window rate limiter backed by a Redis sorted set.
class DistributedSlidingWindowLimiter : public RateLimiter {
public:
    // redis:        shared Redis connection.
    // key_prefix:   prefix of the Redis keys used by this limiter.
    // max_requests: request budget per window.
    // window_size:  length of the window.
    DistributedSlidingWindowLimiter(
        std::shared_ptr<sw::redis::Redis> redis,
        const std::string& key_prefix,
        uint32_t max_requests,
        std::chrono::milliseconds window_size);

    bool try_acquire() override;
    bool try_acquire(uint32_t permits) override;
    bool acquire_with_timeout(std::chrono::milliseconds timeout) override;
    uint32_t available_permits() const override;

private:
    std::shared_ptr<sw::redis::Redis> redis_;
    std::string key_prefix_;
    uint32_t max_requests_;
    std::chrono::milliseconds window_size_;

    // Lua script, executed atomically by Redis.
    //   KEYS[1] = sorted-set key
    //   ARGV[1] = window length, ARGV[2] = request limit, ARGV[3] = current time
    //   (the EXPIRE call divides the window by 1000, i.e. it treats it as
    //   milliseconds).
    // Returns 1 when the request is admitted, 0 when the window is full.
    //
    // The member stored for an admitted request is "<now>-<count>" rather
    // than the bare timestamp: sorted-set members are unique, so two
    // requests arriving with the same timestamp would otherwise collapse
    // into one entry and be under-counted. The count grows with every entry
    // added at the same timestamp, which keeps the members distinct.
    static constexpr const char* SLIDING_WINDOW_SCRIPT = R"(
        local key = KEYS[1]
        local window = tonumber(ARGV[1])
        local limit = tonumber(ARGV[2])
        local now = tonumber(ARGV[3])
        
        -- drop requests that fell out of the window
        redis.call('ZREMRANGEBYSCORE', key, 0, now - window)
        
        -- number of requests still inside the window
        local current = redis.call('ZCARD', key)
        
        if current < limit then
            -- admit the request and record it under a unique member
            redis.call('ZADD', key, now, ARGV[3] .. '-' .. current)
            redis.call('EXPIRE', key, math.ceil(window / 1000))
            return 1
        else
            return 0
        end
    )";
};

} // namespace aidc::resilience

AIDC系统实战案例

模块间熔断配置

cpp 复制代码
// aidc_circuit_breaker_config.hpp
#pragma once
#include "resilience/circuit_breaker.hpp"
#include "resilience/rate_limiter.hpp"
#include <unordered_map>
#include <string>

namespace aidc {

// AIDC各模块的熔断/限流配置
class AidcResilienceConfig {
public:
    // 获取Web->ISOS调用的熔断器
    static resilience::CircuitBreakerConfig get_isos_circuit_config() {
        return {
            .failure_threshold_percent = 60.0,  // 60%失败率熔断
            .min_request_threshold = 20,
            .open_duration = std::chrono::milliseconds(10000),
            .half_open_max_requests = 5,
            .window_size = std::chrono::milliseconds(60000),
            .slow_call_threshold = std::chrono::milliseconds(2000)
        };
    }
    
    // 获取Web->Connect调用的熔断器
    static resilience::CircuitBreakerConfig get_connect_circuit_config() {
        return {
            .failure_threshold_percent = 50.0,
            .min_request_threshold = 10,
            .open_duration = std::chrono::milliseconds(5000),
            .half_open_max_requests = 3,
            .window_size = std::chrono::milliseconds(60000),
            .slow_call_threshold = std::chrono::milliseconds(1000)
        };
    }
    
    // 获取服务端限流配置
    static std::unique_ptr<resilience::RateLimiter> get_server_rate_limiter() {
        // 每秒1000个请求,突发2000
        return std::make_unique<resilience::TokenBucketRateLimiter>(
            2000,                           // 最大令牌数
            std::chrono::milliseconds(100), // 每100ms补充
            100                             // 补充100个令牌
        );
    }
    
    // 获取设备连接限流配置
    static std::unique_ptr<resilience::RateLimiter> get_device_rate_limiter() {
        // 每设备每分钟10次连接
        return std::make_unique<resilience::SlidingWindowRateLimiter>(
            10,                           // 最大请求数
            std::chrono::minutes(1)       // 1分钟窗口
        );
    }
};

} // namespace aidc

Web模块集成示例

cpp 复制代码
// web_resilience_integration.cpp
#include "aidc_circuit_breaker_config.hpp"
#include "grpc_circuit_breaker_interceptor.hpp"
#include <grpcpp/create_channel.h>

namespace aidc::web {

class ResilientGrpcClientManager {
public:
    ResilientGrpcClientManager() {
        // 初始化各服务的熔断器
        isos_circuit_ = std::make_shared<resilience::CircuitBreaker>(
            "isos-service", 
            AidcResilienceConfig::get_isos_circuit_config());
        
        connect_circuit_ = std::make_shared<resilience::CircuitBreaker>(
            "connect-service",
            AidcResilienceConfig::get_connect_circuit_config());
        
        // 创建带熔断器的gRPC channel
        isos_channel_ = create_channel_with_circuit_breaker(
            "localhost:50052", isos_circuit_);
        
        connect_channel_ = create_channel_with_circuit_breaker(
            "localhost:50053", connect_circuit_);
    }
    
    std::shared_ptr<grpc::Channel> get_isos_channel() { return isos_channel_; }
    std::shared_ptr<grpc::Channel> get_connect_channel() { return connect_channel_; }
    
    // 获取熔断器状态(用于监控)
    resilience::CircuitBreakerStats get_isos_stats() const {
        return isos_circuit_->get_stats();
    }
    
    resilience::CircuitBreakerStats get_connect_stats() const {
        return connect_circuit_->get_stats();
    }

private:
    std::shared_ptr<grpc::Channel> create_channel_with_circuit_breaker(
        const std::string& target,
        std::shared_ptr<resilience::CircuitBreaker> cb) {
        
        std::vector<std::unique_ptr<grpc::experimental::ClientInterceptorFactoryInterface>>
            interceptor_creators;
        
        interceptor_creators.push_back(
            std::make_unique<grpc::CircuitBreakerInterceptorFactory>(cb));
        
        grpc::ChannelArguments args;
        args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, 10000);
        args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, 5000);
        
        return grpc::experimental::CreateCustomChannelWithInterceptors(
            target,
            grpc::InsecureChannelCredentials(),
            args,
            std::move(interceptor_creators));
    }
    
    std::shared_ptr<resilience::CircuitBreaker> isos_circuit_;
    std::shared_ptr<resilience::CircuitBreaker> connect_circuit_;
    
    std::shared_ptr<grpc::Channel> isos_channel_;
    std::shared_ptr<grpc::Channel> connect_channel_;
};

} // namespace aidc::web

监控与告警

熔断器监控指标

cpp 复制代码
// resilience_metrics.hpp
#pragma once
#include "circuit_breaker.hpp"
#include <prometheus/counter.h>
#include <prometheus/gauge.h>
#include <prometheus/histogram.h>

namespace aidc::resilience {

// Metrics for circuit breakers and rate limiters, registered with a
// prometheus::Registry.
// NOTE(review): only the declaration is visible here; how the `name`
// arguments map onto the single Counter/Gauge/Histogram references below
// depends on the implementation — confirm.
class ResilienceMetrics {
public:
    explicit ResilienceMetrics(prometheus::Registry& registry);
    
    // Record a circuit-breaker state change.
    void record_circuit_state_change(const std::string& name,
                                     CircuitState from,
                                     CircuitState to);
    
    // Record the result of a request.
    void record_request(const std::string& name,
                       RequestResult result,
                       std::chrono::milliseconds duration);
    
    // Record a rate-limit event.
    void record_rate_limit(const std::string& name);
    
    // Update the circuit-breaker state metric.
    void update_circuit_state(const std::string& name, CircuitState state);
    
    // Update the current failure rate.
    void update_failure_rate(const std::string& name, double rate);

private:
    prometheus::Counter& circuit_transitions_;
    prometheus::Counter& total_requests_;
    prometheus::Counter& rejected_requests_;
    prometheus::Gauge& circuit_state_;
    prometheus::Gauge& failure_rate_;
    prometheus::Histogram& request_duration_;
};

} // namespace aidc::resilience

最佳实践

1. 熔断器配置建议

cpp 复制代码
// 针对不同场景的配置模板
namespace circuit_breaker_templates {

// Profile for critical services (low fault tolerance).
resilience::CircuitBreakerConfig critical_service() {
    resilience::CircuitBreakerConfig profile;
    profile.failure_threshold_percent = 30.0;  // trip quickly
    profile.min_request_threshold = 5;
    profile.open_duration = std::chrono::seconds(30);
    profile.half_open_max_requests = 1;
    profile.window_size = std::chrono::seconds(30);
    profile.slow_call_threshold = std::chrono::milliseconds(500);
    return profile;
}

// Profile for non-critical services (high fault tolerance).
resilience::CircuitBreakerConfig non_critical_service() {
    resilience::CircuitBreakerConfig profile;
    profile.failure_threshold_percent = 80.0;  // tolerate more failures
    profile.min_request_threshold = 50;
    profile.open_duration = std::chrono::seconds(5);
    profile.half_open_max_requests = 10;
    profile.window_size = std::chrono::minutes(5);
    profile.slow_call_threshold = std::chrono::seconds(5);
    return profile;
}

// Profile for external dependencies (unstable network).
resilience::CircuitBreakerConfig external_dependency() {
    resilience::CircuitBreakerConfig profile;
    profile.failure_threshold_percent = 50.0;
    profile.min_request_threshold = 20;
    profile.open_duration = std::chrono::seconds(60);
    profile.half_open_max_requests = 3;
    profile.window_size = std::chrono::minutes(2);
    profile.slow_call_threshold = std::chrono::seconds(10);
    return profile;
}

} // namespace circuit_breaker_templates

2. 降级策略

cpp 复制代码
// Service degradation handler: runs a primary call through a circuit breaker
// and switches to a fallback when the breaker refuses the call.
template<typename Response>
class FallbackHandler {
public:
    using FallbackFunc = std::function<Response()>;

    explicit FallbackHandler(FallbackFunc fallback)
        : fallback_(std::move(fallback)) {}

    // Executes `primary` via `cb`. Returns its result when the call was
    // admitted; when the breaker returns no value, logs a warning and
    // returns the fallback's result instead. Exceptions thrown by `primary`
    // propagate to the caller.
    Response execute_with_fallback(
        std::function<Response()> primary,
        resilience::CircuitBreaker& cb) {

        auto outcome = cb.execute(primary);
        if (!outcome) {
            // The breaker refused the call: degrade.
            spdlog::warn("Circuit breaker open, executing fallback");
            return fallback_();
        }

        return *outcome;
    }

private:
    FallbackFunc fallback_;
};

// Usage example: fetch weather data over gRPC and fall back to the cached
// value when the circuit breaker refuses the call.
class WeatherServiceClient {
public:
    WeatherData get_weather_data(const std::string& station_id) {
        // Both callables are only used inside this function, so capturing
        // by reference is safe.
        auto from_service = [this, &station_id]() {
            return call_grpc_get_weather(station_id);
        };
        auto from_cache = [this, &station_id]() {
            return read_cached_weather(station_id);
        };

        FallbackHandler<WeatherData> handler(from_cache);
        return handler.execute_with_fallback(from_service, circuit_breaker_);
    }
};

总结

通过实施熔断器和限流机制,AIDC系统的弹性得到了显著提升:

  1. 故障隔离:熔断器防止故障级联传播
  2. 过载保护:限流器保护服务免受过载冲击
  3. 优雅降级:配合降级策略保证核心功能可用
  4. 自愈能力:自动检测服务恢复并恢复流量

关键配置建议:

场景 熔断阈值 恢复时间 限流QPS
核心服务 30% 30s 1000
普通服务 50% 10s 2000
外部依赖 60% 60s 500

https://github.com/0voice

相关推荐
6Hzlia2 小时前
【Hot 100 刷题计划】 LeetCode 41. 缺失的第一个正数 | C++ 原地哈希题解
c++·leetcode·哈希算法
十五年专注C++开发2 小时前
达梦数据库在Linux备份报错 -8003: 缺少本地或者远程归档 解决方案
数据库·c++·dm·备份复原
yy_xzz2 小时前
【Linux开发】I/O 复用:select 模型
linux·c++·select
小肝一下2 小时前
每日两道力扣,day6
数据结构·c++·算法·leetcode·双指针·hot100
kang0x03 小时前
Night Coder - Writeup by AI
安全
ambition202423 小时前
【算法详解】飞机降落问题:DFS剪枝解决调度问题
c语言·数据结构·c++·算法·深度优先·图搜索算法
紫金桥软件3 小时前
国产化 + 跨平台,紫金桥组态软件夯实新能源企业“自主底座”
安全·scada·组态软件·国产工业软件·监控组态软件
I Promise343 小时前
C++ 基础数据结构与 STL 容器详解
开发语言·数据结构·c++