gRPC服务熔断与限流设计
创建日期 : 2026-03-26
作者 : zry
标签: gRPC, 微服务, 熔断器, 限流, 弹性设计, AIDC
目录
引言
在AIDC自动气象站数据收集系统中,Web模块通过gRPC与其他四个核心模块(ISOS、Connect、Update、MQTT)进行通信。当某个下游服务出现故障或性能下降时,如果不加以控制,故障可能会级联传播,最终导致整个系统崩溃。
本文将深入探讨如何通过熔断器和限流机制来增强gRPC服务的弹性,确保AIDC系统在高负载和故障场景下依然能够稳定运行。
微服务故障模式分析
AIDC系统故障场景
级联影响
故障模式
gRPC调用链
用户请求
客户端应用
Web模块:50051
服务发现
ISOS:50052
Connect:50053
Update:50054
MQTT:1883
响应延迟高
连接超时
CPU满载
消息堆积
Web线程池耗尽
全系统不可用
常见故障模式
| 故障模式 | 症状 | 影响 | 防护手段 |
|---|---|---|---|
| 服务超时 | 请求响应时间>阈值 | 线程池耗尽 | 熔断器开启 |
| 服务错误 | HTTP 5xx/gRPC错误码 | 错误传播 | 熔断器计数 |
| 服务过载 | CPU/内存饱和 | 响应变慢 | 限流器拒绝 |
| 网络分区 | 连接丢失 | 请求堆积 | 快速失败 |
| 级联故障 | 故障A导致B不可用 | 雪崩效应 | 舱壁隔离 |
熔断器设计原理
熔断器状态机
初始化
失败率 > 阈值
超时后
探测成功
探测失败
请求成功/失败计数
Closed
Open
HalfOpen
正常状态
请求正常转发
记录成功/失败
熔断状态
快速失败返回
等待open_duration到期后放行探测请求
半开状态
放行少量请求
测试服务恢复
熔断器核心实现
cpp
// circuit_breaker.hpp
#pragma once
#include <atomic>
#include <chrono>
#include <cstdint>
#include <deque>
#include <functional>
#include <mutex>
#include <optional>
#include <string>
namespace aidc::resilience {
// Circuit breaker state.
enum class CircuitState {
CLOSED, // Closed: requests are forwarded normally
OPEN, // Open: requests are rejected
HALF_OPEN // Half-open: trial requests are let through
};
// Outcome of a single protected request, as recorded by the breaker.
enum class RequestResult {
SUCCESS,
FAILURE,
TIMEOUT
};
// Circuit breaker configuration.
struct CircuitBreakerConfig {
// Failure-rate threshold (in percent) that trips the breaker
double failure_threshold_percent = 50.0;
// Minimum number of recorded requests before tripping is considered
uint32_t min_request_threshold = 10;
// How long the breaker stays open before moving to half-open
std::chrono::milliseconds open_duration{5000};
// Number of requests allowed through while half-open
uint32_t half_open_max_requests = 3;
// Size of the statistics window
std::chrono::milliseconds window_size{60000};
// Slow-call threshold (calls exceeding it are treated as failures)
std::chrono::milliseconds slow_call_threshold{1000};
};
// Snapshot of circuit breaker statistics.
//
// Every field carries a default member initializer so that a
// default-constructed snapshot (`CircuitBreakerStats stats;`) starts from
// well-defined values; without them the counters would be indeterminate
// until explicitly assigned, and incrementing them would be undefined.
struct CircuitBreakerStats {
    // Breaker state at the time the snapshot was taken.
    CircuitState current_state = CircuitState::CLOSED;
    // Number of requests recorded in the statistics window.
    uint64_t total_requests = 0;
    // Requests in the window that were recorded as SUCCESS.
    uint64_t successful_requests = 0;
    // Requests in the window that were recorded as anything but SUCCESS.
    uint64_t failed_requests = 0;
    // Requests rejected by the breaker.
    uint64_t rejected_requests = 0;
    // failed_requests / total_requests, in percent.
    double current_failure_rate = 0.0;
    // Wall-clock time of the most recent state change.
    std::chrono::system_clock::time_point last_state_change{};
};
// Circuit breaker guarding calls to one downstream dependency.
//
// Results of protected calls are kept in a time window; the breaker moves
// between CLOSED, OPEN and HALF_OPEN based on those results (see the
// out-of-line member definitions for the exact transition rules).
class CircuitBreaker {
public:
    // Creates a breaker identified by `name` (used in log messages) that
    // follows the thresholds in `config`.
    explicit CircuitBreaker(const std::string& name,
                            const CircuitBreakerConfig& config);

    // Runs `func` under the protection of the breaker.
    //
    // Returns std::nullopt without calling `func` when the breaker rejects
    // the request; otherwise returns the value produced by `func`.
    // A call that takes longer than `slow_call_threshold` is recorded as
    // TIMEOUT (its value is still returned); a call that throws is recorded
    // as FAILURE and the exception is rethrown to the caller.
    template<typename Func>
    auto execute(Func&& func) -> std::optional<decltype(func())> {
        if (!allow_request()) {
            return std::nullopt;  // rejected by the breaker: fail fast
        }

        const auto started_at = std::chrono::steady_clock::now();
        std::optional<decltype(func())> outcome;
        try {
            outcome.emplace(func());
        } catch (...) {
            record_result(RequestResult::FAILURE);
            throw;
        }

        // Bookkeeping happens outside the try block on purpose: if recording
        // the result itself throws, the (successful) call must not be
        // recorded a second time as a FAILURE by the handler above.
        const auto elapsed = std::chrono::steady_clock::now() - started_at;
        record_result(elapsed > config_.slow_call_threshold
                          ? RequestResult::TIMEOUT
                          : RequestResult::SUCCESS);
        return outcome;
    }

    // Records a result manually (for asynchronous call sites).
    void record_result(RequestResult result);

    // Returns the current state.
    CircuitState get_state() const;

    // Returns a statistics snapshot.
    CircuitBreakerStats get_stats() const;

    // Forced state transitions (for administrative operations).
    void force_open();
    void force_closed();
    void force_half_open();

private:
    bool allow_request();
    void transition_to(CircuitState new_state);
    void update_failure_rate();
    bool should_trip();

    // One recorded request outcome and the time it was recorded.
    struct RequestRecord {
        RequestResult result;
        std::chrono::steady_clock::time_point timestamp;
    };

    std::string name_;
    CircuitBreakerConfig config_;
    std::atomic<CircuitState> state_{CircuitState::CLOSED};
    std::atomic<uint64_t> rejected_count_{0};

    // Guards recent_records_.
    mutable std::mutex records_mutex_;
    std::deque<RequestRecord> recent_records_;

    // Held while changing state.
    std::mutex state_mutex_;
    std::chrono::system_clock::time_point last_state_change_;
    std::chrono::steady_clock::time_point open_start_time_;
    std::atomic<uint32_t> half_open_requests_{0};
};
} // namespace aidc::resilience
熔断器实现细节
cpp
// circuit_breaker.cpp
#include "circuit_breaker.hpp"
#include <spdlog/spdlog.h>
namespace aidc::resilience {
// Stores the breaker's name and configuration and stamps the construction
// time as the moment of the last state change.
CircuitBreaker::CircuitBreaker(
    const std::string& name,
    const CircuitBreakerConfig& config)
    : name_(name),
      config_(config),
      last_state_change_(std::chrono::system_clock::now()) {
}
bool CircuitBreaker::allow_request() {
CircuitState current = state_.load(std::memory_order_acquire);
switch (current) {
case CircuitState::CLOSED:
return true;
case CircuitState::OPEN: {
// 检查是否到了半开探测时间
auto now = std::chrono::steady_clock::now();
if (now - open_start_time_ >= config_.open_duration) {
std::lock_guard<std::mutex> lock(state_mutex_);
if (state_.load(std::memory_order_acquire) == CircuitState::OPEN) {
transition_to(CircuitState::HALF_OPEN);
half_open_requests_ = 0;
}
return true;
}
++rejected_count_;
spdlog::warn("CircuitBreaker[{}]: Request rejected (OPEN)", name_);
return false;
}
case CircuitState::HALF_OPEN: {
// 限制半开状态的并发请求数
uint32_t current = half_open_requests_.fetch_add(1);
if (current < config_.half_open_max_requests) {
return true;
}
--half_open_requests_;
++rejected_count_;
return false;
}
}
return false;
}
// Records the outcome of one request and applies the resulting state change.
//
//  - The outcome is appended to the statistics window after entries older
//    than `window_size` have been dropped.
//  - HALF_OPEN: a SUCCESS closes the breaker; any other result re-opens it.
//  - CLOSED: the breaker trips to OPEN when should_trip() says so.
void CircuitBreaker::record_result(RequestResult result) {
    const auto now = std::chrono::steady_clock::now();
    {
        std::lock_guard<std::mutex> lock(records_mutex_);
        // Drop expired records.
        const auto cutoff = now - config_.window_size;
        while (!recent_records_.empty() &&
               recent_records_.front().timestamp < cutoff) {
            recent_records_.pop_front();
        }
        // Append the new record.
        recent_records_.push_back({result, now});
    }

    const CircuitState observed = state_.load(std::memory_order_acquire);
    if (observed == CircuitState::HALF_OPEN) {
        // Release one probe slot. The counter is unsigned, and a result may
        // arrive for a request that never took a slot (e.g. one admitted
        // before the breaker became HALF_OPEN), so decrement without ever
        // going below zero.
        uint32_t in_flight = half_open_requests_.load();
        while (in_flight > 0 &&
               !half_open_requests_.compare_exchange_weak(in_flight,
                                                          in_flight - 1)) {
        }

        std::lock_guard<std::mutex> lock(state_mutex_);
        if (state_.load(std::memory_order_acquire) != CircuitState::HALF_OPEN) {
            return;  // another thread already moved the breaker on
        }
        if (result == RequestResult::SUCCESS) {
            // Recovery confirmed. Start the closed period with an empty
            // window: the failures that tripped the breaker are still inside
            // `window_size` and would otherwise re-trip it on the very next
            // recorded request. Lock order (state, then records) matches
            // force_closed().
            {
                std::lock_guard<std::mutex> records_lock(records_mutex_);
                recent_records_.clear();
            }
            transition_to(CircuitState::CLOSED);
            spdlog::info("CircuitBreaker[{}]: Closed (recovery confirmed)", name_);
        } else {
            // Probe failed: open again.
            transition_to(CircuitState::OPEN);
            spdlog::warn("CircuitBreaker[{}]: Re-opened (recovery failed)", name_);
        }
    } else if (observed == CircuitState::CLOSED) {
        // Check whether the breaker has to trip.
        if (should_trip()) {
            std::lock_guard<std::mutex> lock(state_mutex_);
            if (state_.load(std::memory_order_acquire) == CircuitState::CLOSED) {
                transition_to(CircuitState::OPEN);
                spdlog::error("CircuitBreaker[{}]: Tripped OPEN", name_);
            }
        }
    }
}
// Reports whether the failure rate over the recorded window has reached the
// configured threshold. Always false while fewer than
// `min_request_threshold` records are present. Every result other than
// SUCCESS counts as a failure.
bool CircuitBreaker::should_trip() {
    std::lock_guard<std::mutex> guard(records_mutex_);

    const bool enough_samples =
        !(recent_records_.size() < config_.min_request_threshold);
    if (!enough_samples) {
        return false;
    }

    uint64_t failures = 0;
    for (auto it = recent_records_.cbegin(); it != recent_records_.cend(); ++it) {
        if (it->result == RequestResult::SUCCESS) {
            continue;
        }
        ++failures;
    }

    const double failure_rate =
        (static_cast<double>(failures) / recent_records_.size()) * 100.0;
    return failure_rate >= config_.failure_threshold_percent;
}
// Publishes `new_state`. When the state actually changes, the change time is
// stamped, the open-start time is refreshed on entering OPEN, and the
// transition is logged (states are logged as their integer values).
void CircuitBreaker::transition_to(CircuitState new_state) {
    const CircuitState previous =
        state_.exchange(new_state, std::memory_order_release);
    if (previous == new_state) {
        return;  // nothing changed
    }

    last_state_change_ = std::chrono::system_clock::now();
    if (new_state == CircuitState::OPEN) {
        open_start_time_ = std::chrono::steady_clock::now();
    }

    const int from = static_cast<int>(previous);
    const int to = static_cast<int>(new_state);
    spdlog::info("CircuitBreaker[{}]: State changed {} -> {}", name_, from, to);
}
// Returns the breaker's current state (acquire load of the atomic state).
CircuitState CircuitBreaker::get_state() const {
    const CircuitState snapshot = state_.load(std::memory_order_acquire);
    return snapshot;
}
// Builds a statistics snapshot from the records currently in the window.
//
// `total_requests`, `successful_requests`, `failed_requests` and
// `current_failure_rate` describe the window contents; `rejected_requests`
// is the running rejection counter.
CircuitBreakerStats CircuitBreaker::get_stats() const {
    std::lock_guard<std::mutex> lock(records_mutex_);

    // Value-initialize: the counters below are incremented, so they must
    // start from zero rather than from indeterminate values.
    CircuitBreakerStats stats{};
    stats.current_state = get_state();
    stats.total_requests = recent_records_.size();
    stats.rejected_requests = rejected_count_.load();
    // NOTE(review): last_state_change_ is written by transition_to() under
    // state_mutex_, which is not held here — confirm whether this read needs
    // synchronization.
    stats.last_state_change = last_state_change_;

    for (const auto& record : recent_records_) {
        if (record.result == RequestResult::SUCCESS) {
            ++stats.successful_requests;
        } else {
            ++stats.failed_requests;
        }
    }

    stats.current_failure_rate = stats.total_requests > 0
        ? (static_cast<double>(stats.failed_requests) / stats.total_requests) * 100.0
        : 0.0;
    return stats;
}
// Administrative override: moves the breaker to OPEN while holding the
// state lock.
void CircuitBreaker::force_open() {
    std::lock_guard<std::mutex> state_guard(state_mutex_);
    transition_to(CircuitState::OPEN);
}
// Administrative override: discards all recorded results and moves the
// breaker to CLOSED. Locks are taken in the order state, then records.
void CircuitBreaker::force_closed() {
    std::lock_guard<std::mutex> state_guard(state_mutex_);
    {
        // Scoped so the records lock is released before the transition.
        std::lock_guard<std::mutex> window_guard(records_mutex_);
        recent_records_.clear();
    }
    transition_to(CircuitState::CLOSED);
}
// Administrative override: moves the breaker to HALF_OPEN with a fresh
// probe counter.
void CircuitBreaker::force_half_open() {
    std::lock_guard<std::mutex> lock(state_mutex_);
    // Reset the counter before HALF_OPEN becomes visible. Resetting after
    // the transition could erase the slot of a probe that was admitted in
    // between.
    half_open_requests_ = 0;
    transition_to(CircuitState::HALF_OPEN);
}
} // namespace aidc::resilience
限流算法与实现
限流算法对比
漏桶
平滑处理
请求入桶
固定速率流出
令牌桶
固定速率
令牌生成
请求消耗令牌
滑动窗口
平滑过渡
持续滑动
精确计数
固定窗口
重置
窗口1: 0-60s
窗口2: 60-120s
令牌桶限流器
cpp
// rate_limiter.hpp
#pragma once
#include <atomic>
#include <chrono>
#include <mutex>
#include <deque>
namespace aidc::resilience {
// Rate limiter interface.
class RateLimiter {
public:
virtual ~RateLimiter() = default;
// Try to acquire a single permit
virtual bool try_acquire() = 0;
// Try to acquire several permits at once
virtual bool try_acquire(uint32_t permits) = 0;
// Block waiting for a permit (with a timeout)
virtual bool acquire_with_timeout(std::chrono::milliseconds timeout) = 0;
// Number of permits currently available
virtual uint32_t available_permits() const = 0;
};
// Token-bucket rate limiter.
class TokenBucketRateLimiter : public RateLimiter {
public:
// max_permits: bucket capacity; refill_amount tokens are added for every
// elapsed refill_period.
TokenBucketRateLimiter(uint32_t max_permits,
std::chrono::milliseconds refill_period,
uint32_t refill_amount);
bool try_acquire() override;
bool try_acquire(uint32_t permits) override;
bool acquire_with_timeout(std::chrono::milliseconds timeout) override;
uint32_t available_permits() const override;
private:
// Adds the tokens that have accrued since the last refill
void refill();
const uint32_t max_permits_;
const std::chrono::milliseconds refill_period_;
const uint32_t refill_amount_;
mutable std::mutex mutex_;
// Current token balance
std::atomic<double> available_tokens_{0};
// Time of the last refill
std::chrono::steady_clock::time_point last_refill_;
};
// Sliding-window rate limiter.
// NOTE(review): the member definitions are not shown here; presumably at
// most max_requests are admitted per window_size — confirm against the
// implementation.
class SlidingWindowRateLimiter : public RateLimiter {
public:
SlidingWindowRateLimiter(uint32_t max_requests,
std::chrono::milliseconds window_size);
bool try_acquire() override;
bool try_acquire(uint32_t permits) override;
bool acquire_with_timeout(std::chrono::milliseconds timeout) override;
uint32_t available_permits() const override;
private:
void cleanup_old_requests();
const uint32_t max_requests_;
const std::chrono::milliseconds window_size_;
mutable std::mutex mutex_;
// Request timestamps
std::deque<std::chrono::steady_clock::time_point> requests_;
};
} // namespace aidc::resilience
令牌桶实现
cpp
// token_bucket.cpp
#include "rate_limiter.hpp"
#include <algorithm>
#include <condition_variable>
namespace aidc::resilience {
// Creates a bucket that starts full (`max_permits` tokens) and treats the
// moment of construction as the time of the last refill.
TokenBucketRateLimiter::TokenBucketRateLimiter(
    uint32_t max_permits,
    std::chrono::milliseconds refill_period,
    uint32_t refill_amount)
    : max_permits_(max_permits),
      refill_period_(refill_period),
      refill_amount_(refill_amount),
      available_tokens_(static_cast<double>(max_permits)),
      last_refill_(std::chrono::steady_clock::now()) {
}
// Credits the tokens that have accrued since the last refill, capped at
// `max_permits_`. The callers in this file hold `mutex_` while calling it.
//
// Only whole refill periods are credited. `last_refill_` is advanced by
// exactly the credited periods, not set to "now", so the unconsumed part of
// the current period carries over and the long-run rate matches the
// configured one.
void TokenBucketRateLimiter::refill() {
    using Rep = std::chrono::milliseconds::rep;

    // A non-positive period would make the division below undefined.
    if (refill_period_ <= std::chrono::milliseconds::zero()) {
        return;
    }

    const auto now = std::chrono::steady_clock::now();
    const auto elapsed = now - last_refill_;
    if (elapsed < refill_period_) {
        return;
    }

    const Rep periods = elapsed / refill_period_;
    const double tokens_to_add = static_cast<double>(periods) * refill_amount_;
    const double current = available_tokens_.load(std::memory_order_relaxed);
    const double refilled = std::min(current + tokens_to_add,
                                     static_cast<double>(max_permits_));
    available_tokens_.store(refilled, std::memory_order_relaxed);
    last_refill_ += refill_period_ * periods;
}
// Tries to take exactly one token; delegates to the multi-permit overload.
bool TokenBucketRateLimiter::try_acquire() {
    const uint32_t single_permit = 1;
    return try_acquire(single_permit);
}
// Tries to take `permits` tokens at once. Refills the bucket first, then
// either deducts all requested tokens and returns true, or leaves the
// balance untouched and returns false.
bool TokenBucketRateLimiter::try_acquire(uint32_t permits) {
    std::lock_guard<std::mutex> guard(mutex_);
    refill();

    const double balance = available_tokens_.load(std::memory_order_relaxed);
    const bool enough = balance >= permits;
    if (!enough) {
        return false;
    }

    available_tokens_.store(balance - permits, std::memory_order_relaxed);
    return true;
}
// Waits up to `timeout` for a single token.
//
// Returns true as soon as a token is acquired and false once the deadline
// passes. At least one attempt is made, so a zero timeout behaves like
// try_acquire(). Between attempts the thread sleeps for the estimated time
// until a token is available, bounded to 1..10 ms and never past the
// deadline.
//
// The sleep happens with `mutex_` released, so other threads can keep
// acquiring while this one waits.
//
// Requires <thread> for std::this_thread::sleep_for.
bool TokenBucketRateLimiter::acquire_with_timeout(
    std::chrono::milliseconds timeout) {
    using std::chrono::milliseconds;
    using Clock = std::chrono::steady_clock;

    const milliseconds min_poll{1};
    const milliseconds max_poll{10};
    const auto deadline = Clock::now() + timeout;

    for (;;) {
        if (try_acquire()) {
            return true;
        }

        const Clock::duration remaining = deadline - Clock::now();
        if (remaining <= Clock::duration::zero()) {
            return false;
        }

        // Estimate how long it takes for the missing fraction of a token to
        // be refilled.
        milliseconds wait = max_poll;
        {
            std::lock_guard<std::mutex> lock(mutex_);
            refill();
            const double deficit =
                1.0 - available_tokens_.load(std::memory_order_relaxed);
            if (deficit <= 0.0) {
                continue;  // a token showed up meanwhile: retry right away
            }
            if (refill_amount_ > 0) {  // avoid dividing by zero
                const std::chrono::duration<double, std::milli> estimate =
                    refill_period_ * (deficit / refill_amount_);
                if (estimate < max_poll) {
                    // Round up and enforce a floor so the loop never spins
                    // on a zero-length sleep.
                    wait = std::max(min_poll,
                                    std::chrono::ceil<milliseconds>(estimate));
                }
            }
        }  // mutex_ released before sleeping

        std::this_thread::sleep_for(std::min<Clock::duration>(wait, remaining));
    }
}
// Returns the number of whole tokens currently in the bucket, after first
// bringing the bucket up to date. refill() is non-const, hence the
// const_cast on `this`.
uint32_t TokenBucketRateLimiter::available_permits() const {
    std::lock_guard<std::mutex> guard(mutex_);

    auto* self = const_cast<TokenBucketRateLimiter*>(this);
    self->refill();

    const double balance = available_tokens_.load(std::memory_order_relaxed);
    return static_cast<uint32_t>(balance);
}
} // namespace aidc::resilience
gRPC拦截器实现
熔断拦截器
cpp
// grpc_circuit_breaker_interceptor.hpp
#pragma once
#include <grpcpp/grpcpp.h>
#include "circuit_breaker.hpp"
namespace aidc::grpc {
// 客户端熔断拦截器
class CircuitBreakerInterceptor
: public grpc::experimental::Interceptor {
public:
CircuitBreakerInterceptor(
grpc::experimental::ClientRpcInfo* info,
std::shared_ptr<resilience::CircuitBreaker> circuit_breaker);
void Intercept(grpc::experimental::InterceptorBatchMethods* methods) override;
private:
std::shared_ptr<resilience::CircuitBreaker> circuit_breaker_;
bool intercepted_{false};
};
// 客户端拦截器工厂
class CircuitBreakerInterceptorFactory
: public grpc::experimental::ClientInterceptorFactoryInterface {
public:
explicit CircuitBreakerInterceptorFactory(
std::shared_ptr<resilience::CircuitBreaker> cb)
: circuit_breaker_(cb) {}
grpc::experimental::Interceptor* CreateClientInterceptor(
grpc::experimental::ClientRpcInfo* info) override {
return new CircuitBreakerInterceptor(info, circuit_breaker_);
}
private:
std::shared_ptr<resilience::CircuitBreaker> circuit_breaker_;
};
// 服务端限流拦截器
class RateLimitInterceptor
: public grpc::experimental::Interceptor {
public:
RateLimitInterceptor(
grpc::experimental::ServerRpcInfo* info,
std::shared_ptr<resilience::RateLimiter> rate_limiter);
void Intercept(grpc::experimental::InterceptorBatchMethods* methods) override;
private:
std::shared_ptr<resilience::RateLimiter> rate_limiter_;
};
class RateLimitInterceptorFactory
: public grpc::experimental::ServerInterceptorFactoryInterface {
public:
explicit RateLimitInterceptorFactory(
std::shared_ptr<resilience::RateLimiter> rl)
: rate_limiter_(rl) {}
grpc::experimental::Interceptor* CreateServerInterceptor(
grpc::experimental::ServerRpcInfo* info) override {
return new RateLimitInterceptor(info, rate_limiter_);
}
private:
std::shared_ptr<resilience::RateLimiter> rate_limiter_;
};
} // namespace aidc::grpc
拦截器实现
cpp
// grpc_circuit_breaker_interceptor.cpp
#include "grpc_circuit_breaker_interceptor.hpp"
#include <spdlog/spdlog.h>
namespace aidc::grpc {
// Stores the shared circuit breaker; `info` is not used.
// `::grpc::` is required here because inside `namespace aidc::grpc` an
// unqualified `grpc::` resolves to `aidc::grpc`.
CircuitBreakerInterceptor::CircuitBreakerInterceptor(
    ::grpc::experimental::ClientRpcInfo* info,
    std::shared_ptr<resilience::CircuitBreaker> circuit_breaker)
    : circuit_breaker_(circuit_breaker) {}
void CircuitBreakerInterceptor::Intercept(
grpc::experimental::InterceptorBatchMethods* methods) {
if (methods->QueryInterceptionHookPoint(
grpc::experimental::InterceptionHookPoints::PRE_SEND_INITIAL_METADATA)) {
// 检查熔断器状态
if (circuit_breaker_->get_state() ==
resilience::CircuitState::OPEN) {
// 熔断器打开,快速失败
grpc::Status status(grpc::StatusCode::UNAVAILABLE,
"Circuit breaker is OPEN");
auto* status_ptr = methods->GetRecvStatus();
if (status_ptr) {
*status_ptr = status;
}
spdlog::warn("gRPC call blocked by circuit breaker");
// 跳过后续处理
methods->Hijack();
return;
}
intercepted_ = true;
}
if (methods->QueryInterceptionHookPoint(
grpc::experimental::InterceptionHookPoints::POST_RECV_STATUS)) {
if (intercepted_) {
auto* status = methods->GetRecvStatus();
if (status) {
if (status->ok()) {
circuit_breaker_->record_result(
resilience::RequestResult::SUCCESS);
} else {
// 根据错误码判断是否为失败
auto code = status->error_code();
if (code == grpc::StatusCode::DEADLINE_EXCEEDED ||
code == grpc::StatusCode::UNAVAILABLE ||
code == grpc::StatusCode::INTERNAL) {
circuit_breaker_->record_result(
resilience::RequestResult::FAILURE);
}
}
}
}
}
methods->Proceed();
}
// Stores the shared rate limiter; `info` is not used.
// `::grpc::` is required here because inside `namespace aidc::grpc` an
// unqualified `grpc::` resolves to `aidc::grpc`.
RateLimitInterceptor::RateLimitInterceptor(
    ::grpc::experimental::ServerRpcInfo* info,
    std::shared_ptr<resilience::RateLimiter> rate_limiter)
    : rate_limiter_(rate_limiter) {}
void RateLimitInterceptor::Intercept(
grpc::experimental::InterceptorBatchMethods* methods) {
if (methods->QueryInterceptionHookPoint(
grpc::experimental::InterceptionHookPoints::PRE_SEND_INITIAL_METADATA)) {
// 尝试获取许可
if (!rate_limiter_->try_acquire()) {
// 限流触发
grpc::Status status(grpc::StatusCode::RESOURCE_EXHAUSTED,
"Rate limit exceeded");
auto* status_ptr = methods->GetRecvStatus();
if (status_ptr) {
*status_ptr = status;
}
spdlog::warn("gRPC call rejected by rate limiter");
methods->Hijack();
return;
}
}
methods->Proceed();
}
} // namespace aidc::grpc
分布式熔断与限流
Redis-based分布式限流
cpp
// distributed_rate_limiter.hpp
#pragma once
#include "rate_limiter.hpp"
#include <sw/redis++/redis.h>
namespace aidc::resilience {
// Distributed sliding-window rate limiter backed by a Redis sorted set.
class DistributedSlidingWindowLimiter : public RateLimiter {
public:
    DistributedSlidingWindowLimiter(
        std::shared_ptr<sw::redis::Redis> redis,
        const std::string& key_prefix,
        uint32_t max_requests,
        std::chrono::milliseconds window_size);

    bool try_acquire() override;
    bool try_acquire(uint32_t permits) override;
    bool acquire_with_timeout(std::chrono::milliseconds timeout) override;
    uint32_t available_permits() const override;

private:
    std::shared_ptr<sw::redis::Redis> redis_;
    std::string key_prefix_;
    uint32_t max_requests_;
    std::chrono::milliseconds window_size_;

    // Lua script that makes check-and-record atomic.
    //
    // KEYS[1] = sorted-set key; ARGV[1] = window length, ARGV[2] = request
    // limit, ARGV[3] = current time (the EXPIRE line treats the window as
    // milliseconds). Returns 1 when the request is admitted, 0 otherwise.
    //
    // The sorted-set member is "<now>:<count before insert>" rather than the
    // bare timestamp. Members of a sorted set are unique, so with the bare
    // timestamp two requests in the same millisecond would collapse into one
    // entry and the window would under-count.
    // NOTE(review): uniqueness assumes callers pass non-decreasing `now`
    // values; with strongly skewed client clocks a caller-supplied unique id
    // would be more robust.
    static constexpr const char* SLIDING_WINDOW_SCRIPT = R"(
local key = KEYS[1]
local window = tonumber(ARGV[1])
local limit = tonumber(ARGV[2])
local now = tonumber(ARGV[3])
-- 清理过期请求
redis.call('ZREMRANGEBYSCORE', key, 0, now - window)
-- 获取当前窗口内的请求数
local current = redis.call('ZCARD', key)
if current < limit then
-- 允许请求,添加时间戳
redis.call('ZADD', key, now, ARGV[3] .. ':' .. current)
redis.call('EXPIRE', key, math.ceil(window / 1000))
return 1
else
return 0
end
)";
};
} // namespace aidc::resilience
AIDC系统实战案例
模块间熔断配置
cpp
// aidc_circuit_breaker_config.hpp
#pragma once
#include "resilience/circuit_breaker.hpp"
#include "resilience/rate_limiter.hpp"
#include <unordered_map>
#include <string>
namespace aidc {
// AIDC各模块的熔断/限流配置
class AidcResilienceConfig {
public:
// 获取Web->ISOS调用的熔断器
static resilience::CircuitBreakerConfig get_isos_circuit_config() {
return {
.failure_threshold_percent = 60.0, // 60%失败率熔断
.min_request_threshold = 20,
.open_duration = std::chrono::milliseconds(10000),
.half_open_max_requests = 5,
.window_size = std::chrono::milliseconds(60000),
.slow_call_threshold = std::chrono::milliseconds(2000)
};
}
// 获取Web->Connect调用的熔断器
static resilience::CircuitBreakerConfig get_connect_circuit_config() {
return {
.failure_threshold_percent = 50.0,
.min_request_threshold = 10,
.open_duration = std::chrono::milliseconds(5000),
.half_open_max_requests = 3,
.window_size = std::chrono::milliseconds(60000),
.slow_call_threshold = std::chrono::milliseconds(1000)
};
}
// 获取服务端限流配置
static std::unique_ptr<resilience::RateLimiter> get_server_rate_limiter() {
// 每秒1000个请求,突发2000
return std::make_unique<resilience::TokenBucketRateLimiter>(
2000, // 最大令牌数
std::chrono::milliseconds(100), // 每100ms补充
100 // 补充100个令牌
);
}
// 获取设备连接限流配置
static std::unique_ptr<resilience::RateLimiter> get_device_rate_limiter() {
// 每设备每分钟10次连接
return std::make_unique<resilience::SlidingWindowRateLimiter>(
10, // 最大请求数
std::chrono::minutes(1) // 1分钟窗口
);
}
};
} // namespace aidc
Web模块集成示例
cpp
// web_resilience_integration.cpp
#include "aidc_circuit_breaker_config.hpp"
#include "grpc_circuit_breaker_interceptor.hpp"
#include <grpcpp/create_channel.h>
namespace aidc::web {
class ResilientGrpcClientManager {
public:
ResilientGrpcClientManager() {
// 初始化各服务的熔断器
isos_circuit_ = std::make_shared<resilience::CircuitBreaker>(
"isos-service",
AidcResilienceConfig::get_isos_circuit_config());
connect_circuit_ = std::make_shared<resilience::CircuitBreaker>(
"connect-service",
AidcResilienceConfig::get_connect_circuit_config());
// 创建带熔断器的gRPC channel
isos_channel_ = create_channel_with_circuit_breaker(
"localhost:50052", isos_circuit_);
connect_channel_ = create_channel_with_circuit_breaker(
"localhost:50053", connect_circuit_);
}
std::shared_ptr<grpc::Channel> get_isos_channel() { return isos_channel_; }
std::shared_ptr<grpc::Channel> get_connect_channel() { return connect_channel_; }
// 获取熔断器状态(用于监控)
resilience::CircuitBreakerStats get_isos_stats() const {
return isos_circuit_->get_stats();
}
resilience::CircuitBreakerStats get_connect_stats() const {
return connect_circuit_->get_stats();
}
private:
std::shared_ptr<grpc::Channel> create_channel_with_circuit_breaker(
const std::string& target,
std::shared_ptr<resilience::CircuitBreaker> cb) {
std::vector<std::unique_ptr<grpc::experimental::ClientInterceptorFactoryInterface>>
interceptor_creators;
interceptor_creators.push_back(
std::make_unique<grpc::CircuitBreakerInterceptorFactory>(cb));
grpc::ChannelArguments args;
args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, 10000);
args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, 5000);
return grpc::experimental::CreateCustomChannelWithInterceptors(
target,
grpc::InsecureChannelCredentials(),
args,
std::move(interceptor_creators));
}
std::shared_ptr<resilience::CircuitBreaker> isos_circuit_;
std::shared_ptr<resilience::CircuitBreaker> connect_circuit_;
std::shared_ptr<grpc::Channel> isos_channel_;
std::shared_ptr<grpc::Channel> connect_channel_;
};
} // namespace aidc::web
监控与告警
熔断器监控指标
cpp
// resilience_metrics.hpp
#pragma once
#include "circuit_breaker.hpp"
#include <prometheus/counter.h>
#include <prometheus/gauge.h>
#include <prometheus/histogram.h>
namespace aidc::resilience {
// Metrics for circuit breakers and rate limiters, held as Prometheus
// counters, gauges and a histogram.
// NOTE(review): only declarations are shown here; how the `name` arguments
// map onto metric labels is not visible — confirm against the implementation.
class ResilienceMetrics {
public:
explicit ResilienceMetrics(prometheus::Registry& registry);
// Record a circuit breaker state change
void record_circuit_state_change(const std::string& name,
CircuitState from,
CircuitState to);
// Record a request result
void record_request(const std::string& name,
RequestResult result,
std::chrono::milliseconds duration);
// Record a rate-limit event
void record_rate_limit(const std::string& name);
// Update the circuit breaker state metric
void update_circuit_state(const std::string& name, CircuitState state);
// Update the current failure rate
void update_failure_rate(const std::string& name, double rate);
private:
prometheus::Counter& circuit_transitions_;
prometheus::Counter& total_requests_;
prometheus::Counter& rejected_requests_;
prometheus::Gauge& circuit_state_;
prometheus::Gauge& failure_rate_;
prometheus::Histogram& request_duration_;
};
} // namespace aidc::resilience
最佳实践
1. 熔断器配置建议
cpp
// Configuration templates for different scenarios.
namespace circuit_breaker_templates {

// Critical service (low fault tolerance).
resilience::CircuitBreakerConfig critical_service() {
    resilience::CircuitBreakerConfig cfg;
    cfg.failure_threshold_percent = 30.0;  // trip quickly
    cfg.min_request_threshold = 5;
    cfg.open_duration = std::chrono::seconds(30);
    cfg.half_open_max_requests = 1;
    cfg.window_size = std::chrono::seconds(30);
    cfg.slow_call_threshold = std::chrono::milliseconds(500);
    return cfg;
}

// Non-critical service (high fault tolerance).
resilience::CircuitBreakerConfig non_critical_service() {
    resilience::CircuitBreakerConfig cfg;
    cfg.failure_threshold_percent = 80.0;  // tolerate more failures
    cfg.min_request_threshold = 50;
    cfg.open_duration = std::chrono::seconds(5);
    cfg.half_open_max_requests = 10;
    cfg.window_size = std::chrono::minutes(5);
    cfg.slow_call_threshold = std::chrono::seconds(5);
    return cfg;
}

// External dependency (unstable network).
resilience::CircuitBreakerConfig external_dependency() {
    resilience::CircuitBreakerConfig cfg;
    cfg.failure_threshold_percent = 50.0;
    cfg.min_request_threshold = 20;
    cfg.open_duration = std::chrono::seconds(60);
    cfg.half_open_max_requests = 3;
    cfg.window_size = std::chrono::minutes(2);
    cfg.slow_call_threshold = std::chrono::seconds(10);
    return cfg;
}

}  // namespace circuit_breaker_templates
2. 降级策略
cpp
// Service degradation handler: runs a primary call through a circuit breaker
// and switches to a fallback when the breaker does not let the call through.
template<typename Response>
class FallbackHandler {
public:
    using FallbackFunc = std::function<Response()>;

    explicit FallbackHandler(FallbackFunc fallback)
        : fallback_(std::move(fallback)) {}

    // Executes `primary` via `cb.execute`. If the breaker yields a value it
    // is returned; if the breaker returns no value, the fallback result is
    // returned instead. Exceptions thrown by `primary` are not handled here.
    Response execute_with_fallback(
        std::function<Response()> primary,
        resilience::CircuitBreaker& cb) {
        auto outcome = cb.execute(primary);
        if (!outcome) {
            // Breaker is open: degrade.
            spdlog::warn("Circuit breaker open, executing fallback");
            return fallback_();
        }
        return *outcome;
    }

private:
    FallbackFunc fallback_;
};
// Usage example: fetch weather data over gRPC, falling back to cached data
// when the circuit breaker does not let the call through.
class WeatherServiceClient {
public:
    WeatherData get_weather_data(const std::string& station_id) {
        // Degraded path: serve the cached reading.
        FallbackHandler<WeatherData> handler(
            [&]() { return read_cached_weather(station_id); });

        // Normal path: the live gRPC call, guarded by the breaker.
        return handler.execute_with_fallback(
            [&]() { return call_grpc_get_weather(station_id); },
            circuit_breaker_);
    }
};
总结
通过实施熔断器和限流机制,AIDC系统的弹性得到了显著提升:
- 故障隔离:熔断器防止故障级联传播
- 过载保护:限流器保护服务免受过载冲击
- 优雅降级:配合降级策略保证核心功能可用
- 自愈能力:自动检测服务恢复并恢复流量
关键配置建议:
| 场景 | 熔断阈值 | 恢复时间 | 限流QPS |
|---|---|---|---|
| 核心服务 | 30% | 30s | 1000 |
| 普通服务 | 50% | 10s | 2000 |
| 外部依赖 | 50% | 60s | 500 |