弹性伸缩设计：AI 应用的自动扩缩容实践

前言

AI 应用的负载通常具有明显的波峰波谷特性。弹性伸缩能帮助我们在保证服务质量的同时，显著降低资源成本。

我在多个 AI 应用中设计过弹性伸缩方案，今天分享一些实践经验。

伸缩策略设计

基于指标的伸缩

python 复制代码

from dataclasses import dataclass
from typing import List, Optional
from enum import Enum
import time

class ScalingDirection(Enum):
    SCALE_UP = "scale_up"
    SCALE_DOWN = "scale_down"
    NO_CHANGE = "no_change"

@dataclass
class ScalingPolicy:
    """伸缩策略"""
    metric_name: str
    threshold_up: float
    threshold_down: float
    adjustment_up: int
    adjustment_down: int
    cooldown_seconds: int

class MetricBasedScaler:
    """基于指标的伸缩器"""
    
    def __init__(self, min_replicas: int, max_replicas: int):
        self.min_replicas = min_replicas
        self.max_replicas = max_replicas
        self.current_replicas = min_replicas
        self.policies: List[ScalingPolicy] = []
        self.last_scale_time = 0
        self.cooldown = 300
    
    def add_policy(self, policy: ScalingPolicy):
        """添加伸缩策略"""
        self.policies.append(policy)
    
    def evaluate(self, metrics: dict) -> ScalingDirection:
        """评估伸缩方向"""
        for policy in self.policies:
            if policy.metric_name not in metrics:
                continue
            
            value = metrics[policy.metric_name]
            
            if value > policy.threshold_up:
                return ScalingDirection.SCALE_UP
            
            if value < policy.threshold_down:
                return ScalingDirection.SCALE_DOWN
        
        return ScalingDirection.NO_CHANGE
    
    def execute(self, direction: ScalingDirection) -> int:
        """执行伸缩"""
        now = time.time()
        
        if now - self.last_scale_time < self.cooldown:
            return self.current_replicas
        
        new_replicas = self.current_replicas
        
        if direction == ScalingDirection.SCALE_UP:
            # 找一个合适的扩容策略
            for policy in self.policies:
                if policy.metric_name in metrics and metrics[policy.metric_name] > policy.threshold_up:
                    new_replicas = min(
                        self.current_replicas + policy.adjustment_up,
                        self.max_replicas
                    )
                    break
        
        elif direction == ScalingDirection.SCALE_DOWN:
            for policy in self.policies:
                if policy.metric_name in metrics and metrics[policy.metric_name] < policy.threshold_down:
                    new_replicas = max(
                        self.current_replicas - policy.adjustment_down,
                        self.min_replicas
                    )
                    break
        
        if new_replicas != self.current_replicas:
            self.current_replicas = new_replicas
            self.last_scale_time = now
        
        return self.current_replicas

# 示例
scaler = MetricBasedScaler(min_replicas=2, max_replicas=20)
scaler.add_policy(ScalingPolicy(
    metric_name="cpu_percent",
    threshold_up=70,
    threshold_down=30,
    adjustment_up=3,
    adjustment_down=1,
    cooldown_seconds=300
))
scaler.add_policy(ScalingPolicy(
    metric_name="queue_length",
    threshold_up=100,
    threshold_down=10,
    adjustment_up=5,
    adjustment_down=2,
    cooldown_seconds=120
))

基于预测的伸缩

python 复制代码

from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

class PredictiveScaler(MetricBasedScaler):
    """预测式伸缩器"""
    
    def __init__(self, min_replicas: int, max_replicas: int):
        super().__init__(min_replicas, max_replicas)
        self.history = []
        self.model = LinearRegression()
        self.is_trained = False
    
    def record_metrics(self, timestamp: float, metrics: dict):
        """记录历史指标"""
        self.history.append({
            "timestamp": timestamp,
            "metrics": metrics
        })
    
    def train_prediction_model(self):
        """训练预测模型"""
        if len(self.history) < 100:
            return False
        
        df = pd.DataFrame([
            {"time": h["timestamp"], "metric": h["metrics"]["cpu_percent"]}
            for h in self.history
        ])
        
        df["time_ordinal"] = df["time"] - df["time"].min()
        X = df[["time_ordinal"]].values
        y = df["metric"].values
        
        self.model.fit(X, y)
        self.is_trained = True
        
        return True
    
    def predict_load(self, future_seconds: float):
        """预测未来负载"""
        if not self.is_trained or len(self.history) < 100:
            return None
        
        last_time = self.history[-1]["timestamp"]
        future_time = last_time + future_seconds
        time_ordinal = future_time - self.history[0]["timestamp"]
        
        X = np.array([[time_ordinal]])
        prediction = self.model.predict(X)
        
        return prediction[0]
    
    def evaluate_predictive(self, metrics: dict):
        """预测式评估"""
        # 先看即时指标
        immediate_action = self.evaluate(metrics)
        
        if immediate_action != ScalingDirection.NO_CHANGE:
            return immediate_action
        
        # 预测未来5分钟
        predicted_load = self.predict_load(300)
        
        if predicted_load and predicted_load > 60:
            return ScalingDirection.SCALE_UP
        
        return ScalingDirection.NO_CHANGE

Kubernetes HPA 配置

水平 Pod 自动伸缩

yaml 复制代码

# hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ai-service-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ai-service
  minReplicas: 2
  maxReplicas: 20
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80
  - type: Pods
    pods:
      metric:
        name: queue_length
      target:
        type: AverageValue
        averageValue: 50
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
      - type: Pods
        value: 5
        periodSeconds: 60
      - type: Percent
        value: 50
        periodSeconds: 60
      selectPolicy: Max
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
      - type: Pods
        value: 2
        periodSeconds: 60
      selectPolicy: Min

自定义指标实现

python 复制代码

from prometheus_client import Gauge, start_http_server
import time
import random

class CustomMetricsExporter:
    """自定义指标导出器"""
    
    def __init__(self, port=8000):
        self.port = port
        self.queue_length_gauge = Gauge(
            'ai_service_queue_length',
            '当前队列长度'
        )
        self.request_latency_gauge = Gauge(
            'ai_service_request_latency',
            '请求延迟（毫秒）'
        )
        self.gpu_utilization_gauge = Gauge(
            'ai_service_gpu_utilization',
            'GPU 利用率'
        )
    
    def start(self):
        """启动指标服务器"""
        start_http_server(self.port)
    
    def update_queue_length(self, length):
        """更新队列长度"""
        self.queue_length_gauge.set(length)
    
    def update_latency(self, latency_ms):
        """更新延迟"""
        self.request_latency_gauge.set(latency_ms)
    
    def update_gpu_utilization(self, percent):
        """更新 GPU 利用率"""
        self.gpu_utilization_gauge.set(percent)
    
    def simulate_load(self):
        """模拟负载（示例）"""
        while True:
            queue_length = random.randint(0, 200)
            latency = random.randint(50, 500)
            gpu = random.uniform(20, 90)
            
            self.update_queue_length(queue_length)
            self.update_latency(latency)
            self.update_gpu_utilization(gpu)
            
            time.sleep(10)

成本优化策略

混合实例策略

python 复制代码

from enum import Enum

class InstanceType(Enum):
    ON_DEMAND = "on_demand"
    SPOT = "spot"
    RESERVED = "reserved"

class MixedInstanceStrategy:
    """混合实例策略"""
    
    def __init__(self):
        self.base_replicas = 5
        self.spot_replicas = 0
        self.spot_fallback = False
    
    def get_instance_mix(self, target_replicas: int):
        """获取实例组合"""
        if target_replicas <= self.base_replicas:
            return {
                InstanceType.ON_DEMAND: target_replicas,
                InstanceType.SPOT: 0
            }
        
        spot_target = min(target_replicas - self.base_replicas, 20)
        on_demand_target = self.base_replicas
        
        return {
            InstanceType.ON_DEMAND: on_demand_target,
            InstanceType.SPOT: spot_target
        }
    
    def handle_spot_interruption(self, terminated_replicas: int):
        """处理 spot 实例中断"""
        self.spot_fallback = True
        self.spot_replicas = max(0, self.spot_replicas - terminated_replicas)
        
        # 临时使用按需实例替代
        return {
            "status": "falling_back",
            "add_on_demand": terminated_replicas
        }

资源缩放策略

python 复制代码

class ResourceScaler:
    """资源缩放器"""
    
    def __init__(self):
        self.resource_profiles = {
            "small": {"cpu": "2", "memory": "4Gi", "gpu": 0},
            "medium": {"cpu": "4", "memory": "8Gi", "gpu": 1},
            "large": {"cpu": "8", "memory": "16Gi", "gpu": 1},
            "xlarge": {"cpu": "16", "memory": "32Gi", "gpu": 2}
        }
        self.current_profile = "medium"
    
    def select_profile(self, requirements):
        """根据需求选择配置"""
        if requirements.get("latency") < 100:
            return "xlarge"
        elif requirements.get("throughput") > 1000:
            return "large"
        elif requirements.get("throughput") > 100:
            return "medium"
        else:
            return "small"
    
    def generate_k8s_patch(self, profile_name):
        """生成 K8s 资源补丁"""
        profile = self.resource_profiles[profile_name]
        
        return {
            "spec": {
                "template": {
                    "spec": {
                        "containers": [
                            {
                                "name": "ai-service",
                                "resources": {
                                    "requests": {
                                        "cpu": profile["cpu"],
                                        "memory": profile["memory"]
                                    },
                                    "limits": {
                                        "cpu": profile["cpu"],
                                        "memory": profile["memory"]
                                    }
                                }
                            }
                        ]
                    }
                }
            }
        }

总结

弹性伸缩设计要点：

多指标策略：CPU、内存、队列长度等
预测式伸缩：基于历史数据提前规划
混合实例：按需+Spot 降低成本
冷却机制：避免震荡
资源调度：根据需求调整资源配置

实践经验：

从小规模开始验证策略
充分冷却时间避免抖动
设置合理的 max_replicas 防止爆仓
监控伸缩效果，持续优化