弹性伸缩设计:AI 应用的自动扩缩容实践

前言
AI 应用的负载通常具有明显的波峰波谷特性。弹性伸缩能帮助我们在保证服务质量的同时,显著降低资源成本。
我在多个 AI 应用中设计过弹性伸缩方案,今天分享一些实践经验。
伸缩策略设计
基于指标的伸缩
python
from dataclasses import dataclass
from typing import List, Optional
from enum import Enum
import time
class ScalingDirection(Enum):
SCALE_UP = "scale_up"
SCALE_DOWN = "scale_down"
NO_CHANGE = "no_change"
@dataclass
class ScalingPolicy:
"""伸缩策略"""
metric_name: str
threshold_up: float
threshold_down: float
adjustment_up: int
adjustment_down: int
cooldown_seconds: int
class MetricBasedScaler:
"""基于指标的伸缩器"""
def __init__(self, min_replicas: int, max_replicas: int):
self.min_replicas = min_replicas
self.max_replicas = max_replicas
self.current_replicas = min_replicas
self.policies: List[ScalingPolicy] = []
self.last_scale_time = 0
self.cooldown = 300
def add_policy(self, policy: ScalingPolicy):
"""添加伸缩策略"""
self.policies.append(policy)
def evaluate(self, metrics: dict) -> ScalingDirection:
"""评估伸缩方向"""
for policy in self.policies:
if policy.metric_name not in metrics:
continue
value = metrics[policy.metric_name]
if value > policy.threshold_up:
return ScalingDirection.SCALE_UP
if value < policy.threshold_down:
return ScalingDirection.SCALE_DOWN
return ScalingDirection.NO_CHANGE
def execute(self, direction: ScalingDirection) -> int:
"""执行伸缩"""
now = time.time()
if now - self.last_scale_time < self.cooldown:
return self.current_replicas
new_replicas = self.current_replicas
if direction == ScalingDirection.SCALE_UP:
# 找一个合适的扩容策略
for policy in self.policies:
if policy.metric_name in metrics and metrics[policy.metric_name] > policy.threshold_up:
new_replicas = min(
self.current_replicas + policy.adjustment_up,
self.max_replicas
)
break
elif direction == ScalingDirection.SCALE_DOWN:
for policy in self.policies:
if policy.metric_name in metrics and metrics[policy.metric_name] < policy.threshold_down:
new_replicas = max(
self.current_replicas - policy.adjustment_down,
self.min_replicas
)
break
if new_replicas != self.current_replicas:
self.current_replicas = new_replicas
self.last_scale_time = now
return self.current_replicas
# 示例
scaler = MetricBasedScaler(min_replicas=2, max_replicas=20)
scaler.add_policy(ScalingPolicy(
metric_name="cpu_percent",
threshold_up=70,
threshold_down=30,
adjustment_up=3,
adjustment_down=1,
cooldown_seconds=300
))
scaler.add_policy(ScalingPolicy(
metric_name="queue_length",
threshold_up=100,
threshold_down=10,
adjustment_up=5,
adjustment_down=2,
cooldown_seconds=120
))
基于预测的伸缩
python
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
class PredictiveScaler(MetricBasedScaler):
"""预测式伸缩器"""
def __init__(self, min_replicas: int, max_replicas: int):
super().__init__(min_replicas, max_replicas)
self.history = []
self.model = LinearRegression()
self.is_trained = False
def record_metrics(self, timestamp: float, metrics: dict):
"""记录历史指标"""
self.history.append({
"timestamp": timestamp,
"metrics": metrics
})
def train_prediction_model(self):
"""训练预测模型"""
if len(self.history) < 100:
return False
df = pd.DataFrame([
{"time": h["timestamp"], "metric": h["metrics"]["cpu_percent"]}
for h in self.history
])
df["time_ordinal"] = df["time"] - df["time"].min()
X = df[["time_ordinal"]].values
y = df["metric"].values
self.model.fit(X, y)
self.is_trained = True
return True
def predict_load(self, future_seconds: float):
"""预测未来负载"""
if not self.is_trained or len(self.history) < 100:
return None
last_time = self.history[-1]["timestamp"]
future_time = last_time + future_seconds
time_ordinal = future_time - self.history[0]["timestamp"]
X = np.array([[time_ordinal]])
prediction = self.model.predict(X)
return prediction[0]
def evaluate_predictive(self, metrics: dict):
"""预测式评估"""
# 先看即时指标
immediate_action = self.evaluate(metrics)
if immediate_action != ScalingDirection.NO_CHANGE:
return immediate_action
# 预测未来5分钟
predicted_load = self.predict_load(300)
if predicted_load and predicted_load > 60:
return ScalingDirection.SCALE_UP
return ScalingDirection.NO_CHANGE
Kubernetes HPA 配置
水平 Pod 自动伸缩
yaml
# hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: ai-service-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: ai-service
minReplicas: 2
maxReplicas: 20
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
- type: Pods
pods:
metric:
name: queue_length
target:
type: AverageValue
averageValue: 50
behavior:
scaleUp:
stabilizationWindowSeconds: 60
policies:
- type: Pods
value: 5
periodSeconds: 60
- type: Percent
value: 50
periodSeconds: 60
selectPolicy: Max
scaleDown:
stabilizationWindowSeconds: 300
policies:
- type: Pods
value: 2
periodSeconds: 60
selectPolicy: Min
自定义指标实现
python
from prometheus_client import Gauge, start_http_server
import time
import random
class CustomMetricsExporter:
"""自定义指标导出器"""
def __init__(self, port=8000):
self.port = port
self.queue_length_gauge = Gauge(
'ai_service_queue_length',
'当前队列长度'
)
self.request_latency_gauge = Gauge(
'ai_service_request_latency',
'请求延迟(毫秒)'
)
self.gpu_utilization_gauge = Gauge(
'ai_service_gpu_utilization',
'GPU 利用率'
)
def start(self):
"""启动指标服务器"""
start_http_server(self.port)
def update_queue_length(self, length):
"""更新队列长度"""
self.queue_length_gauge.set(length)
def update_latency(self, latency_ms):
"""更新延迟"""
self.request_latency_gauge.set(latency_ms)
def update_gpu_utilization(self, percent):
"""更新 GPU 利用率"""
self.gpu_utilization_gauge.set(percent)
def simulate_load(self):
"""模拟负载(示例)"""
while True:
queue_length = random.randint(0, 200)
latency = random.randint(50, 500)
gpu = random.uniform(20, 90)
self.update_queue_length(queue_length)
self.update_latency(latency)
self.update_gpu_utilization(gpu)
time.sleep(10)
成本优化策略
混合实例策略
python
from enum import Enum
class InstanceType(Enum):
ON_DEMAND = "on_demand"
SPOT = "spot"
RESERVED = "reserved"
class MixedInstanceStrategy:
"""混合实例策略"""
def __init__(self):
self.base_replicas = 5
self.spot_replicas = 0
self.spot_fallback = False
def get_instance_mix(self, target_replicas: int):
"""获取实例组合"""
if target_replicas <= self.base_replicas:
return {
InstanceType.ON_DEMAND: target_replicas,
InstanceType.SPOT: 0
}
spot_target = min(target_replicas - self.base_replicas, 20)
on_demand_target = self.base_replicas
return {
InstanceType.ON_DEMAND: on_demand_target,
InstanceType.SPOT: spot_target
}
def handle_spot_interruption(self, terminated_replicas: int):
"""处理 spot 实例中断"""
self.spot_fallback = True
self.spot_replicas = max(0, self.spot_replicas - terminated_replicas)
# 临时使用按需实例替代
return {
"status": "falling_back",
"add_on_demand": terminated_replicas
}
资源缩放策略
python
class ResourceScaler:
"""资源缩放器"""
def __init__(self):
self.resource_profiles = {
"small": {"cpu": "2", "memory": "4Gi", "gpu": 0},
"medium": {"cpu": "4", "memory": "8Gi", "gpu": 1},
"large": {"cpu": "8", "memory": "16Gi", "gpu": 1},
"xlarge": {"cpu": "16", "memory": "32Gi", "gpu": 2}
}
self.current_profile = "medium"
def select_profile(self, requirements):
"""根据需求选择配置"""
if requirements.get("latency") < 100:
return "xlarge"
elif requirements.get("throughput") > 1000:
return "large"
elif requirements.get("throughput") > 100:
return "medium"
else:
return "small"
def generate_k8s_patch(self, profile_name):
"""生成 K8s 资源补丁"""
profile = self.resource_profiles[profile_name]
return {
"spec": {
"template": {
"spec": {
"containers": [
{
"name": "ai-service",
"resources": {
"requests": {
"cpu": profile["cpu"],
"memory": profile["memory"]
},
"limits": {
"cpu": profile["cpu"],
"memory": profile["memory"]
}
}
}
]
}
}
}
}
总结
弹性伸缩设计要点:
- 多指标策略:CPU、内存、队列长度等
- 预测式伸缩:基于历史数据提前规划
- 混合实例:按需+Spot 降低成本
- 冷却机制:避免震荡
- 资源调度:根据需求调整资源配置
实践经验:
- 从小规模开始验证策略
- 充分冷却时间避免抖动
- 设置合理的 max_replicas 防止爆仓
- 监控伸缩效果,持续优化