引言
在移动端机器学习平台中,模型服务的部署与运维是确保系统稳定可靠运行的关键环节。一个优秀的部署运维平台应该能够自动化模型从开发到生产的整个流程,提供完善的监控、告警、自动扩缩容和故障恢复能力。本文将详细介绍如何构建一个企业级的模型服务部署与运维平台。
1. 部署平台架构设计
1.1 系统架构概览
text
┌─────────────────────────────────────────────────────────────┐
│ 部署运维平台 │
├─────────────────────────────────────────────────────────────┤
│ 持续集成/持续部署 │ 服务编排 │ 监控告警 │ 故障恢复 │ 成本优化 │
├─────────────────────────────────────────────────────────────┤
│ 代码仓库集成 │ 容器编排 │ 指标收集 │ 健康检查 │ 资源调度 │
│ 自动化构建 │ 服务发现 │ 日志聚合 │ 自动回滚 │ 预算管理 │
│ 测试流水线 │ 负载均衡 │ 告警管理 │ 故障隔离 │ 成本分析 │
│ 部署策略 │ 自动扩缩容 │ 性能分析 │ 容灾演练 │ 优化建议 │
└─────────────────────────────────────────────────────────────┘
1.2 核心组件设计
kotlin
// 部署配置实体
/**
 * Complete description of one model-service deployment: which model to serve,
 * the resources and replica bounds, rollout/rollback strategy, health probes,
 * observability settings, ports, security settings and bookkeeping fields.
 *
 * [namespace], [image] and [env] are declared last and carry defaults so that
 * every existing constructor call keeps compiling. They were added because the
 * pipeline and Kubernetes deployer code in this file read and write these
 * properties on [DeploymentConfig], which previously did not declare them.
 */
data class DeploymentConfig(
    val deploymentId: String,                       // Deployment ID
    val name: String,                               // Deployment name
    val description: String?,                       // Optional description
    // Model configuration
    val modelId: String,                            // Model ID
    val version: String,                            // Model version
    val modelPath: String,                          // Model path
    val modelType: ModelType,                       // Model type
    val runtime: RuntimeType,                       // Runtime environment
    // Resource configuration
    val resources: ResourceRequirements,            // Resource requirements
    val replicas: Int,                              // Replica count
    val maxReplicas: Int,                           // Maximum replica count
    val minReplicas: Int,                           // Minimum replica count
    // Deployment strategy
    val deploymentStrategy: DeploymentStrategy,     // Deployment strategy
    val updateStrategy: UpdateStrategy,             // Update strategy
    val rollbackConfig: RollbackConfig,             // Rollback configuration
    // Health checks
    val livenessProbe: HealthProbeConfig,           // Liveness probe
    val readinessProbe: HealthProbeConfig,          // Readiness probe
    val startupProbe: HealthProbeConfig,            // Startup probe
    // Observability
    val metrics: MetricsConfig,                     // Metrics configuration
    val logging: LoggingConfig,                     // Logging configuration
    val tracing: TracingConfig,                     // Tracing configuration
    // Networking
    val servicePort: Int,                           // Service port
    val grpcPort: Int,                              // gRPC port
    val metricsPort: Int,                           // Metrics port
    // Security
    val securityContext: SecurityContext,           // Security context
    val secrets: List<SecretRef>,                   // Secret references
    val createdAt: Long,                            // Creation time
    val updatedAt: Long,                            // Last update time
    val createdBy: String,                          // Creator
    val status: DeploymentStatus,                   // Deployment status
    val metadata: Map<String, String> = emptyMap(), // Free-form metadata
    // Target namespace.
    // NOTE(review): "default" mirrors the Kubernetes default namespace name — confirm this fallback is wanted.
    val namespace: String = "default",
    // Container image reference (the deployer passes it to the container spec); null when not yet known.
    val image: String? = null,
    // Environment variables for the service container.
    val env: Map<String, String> = emptyMap()
)
// 部署策略枚举
/** Rollout strategies a deployment can be configured with. */
enum class DeploymentStrategy {
    /** Recreate strategy. */
    RECREATE,

    /** Rolling update. */
    ROLLING_UPDATE,

    /** Blue-green deployment. */
    BLUE_GREEN,

    /** Canary deployment. */
    CANARY
}
// 资源需求
/**
 * Resource requirements of a deployment.
 *
 * @property cpu CPU request/limit pair.
 * @property memory Memory request/limit pair.
 * @property gpu Optional GPU request/limit pair; null when none is declared.
 * @property storage Optional storage request/limit pair; null when none is declared.
 */
data class ResourceRequirements(
    val cpu: ResourceLimit,
    val memory: ResourceLimit,
    val gpu: ResourceLimit? = null,
    val storage: ResourceLimit? = null
)
/**
 * A request/limit pair for a single resource, kept as strings.
 *
 * @property request Requested amount.
 * @property limit Upper limit.
 */
data class ResourceLimit(
    val request: String,
    val limit: String
)
// 健康检查配置
/**
 * Settings for a single health probe.
 *
 * @property type Probe type.
 * @property initialDelaySeconds Initial delay, in seconds.
 * @property periodSeconds Check period, in seconds.
 * @property timeoutSeconds Timeout, in seconds.
 * @property successThreshold Success threshold.
 * @property failureThreshold Failure threshold.
 * @property endpoint Endpoint to check; optional.
 * @property command Command to run as the check; optional.
 */
data class HealthProbeConfig(
    val type: ProbeType,
    val initialDelaySeconds: Int,
    val periodSeconds: Int,
    val timeoutSeconds: Int,
    val successThreshold: Int,
    val failureThreshold: Int,
    val endpoint: String? = null,
    val command: List<String>? = null
)
// 部署状态
/**
 * Snapshot of a deployment's state.
 *
 * @property phase Deployment phase.
 * @property replicas Desired replica count.
 * @property readyReplicas Ready replica count.
 * @property availableReplicas Available replica count.
 * @property updatedReplicas Updated replica count.
 * @property unavailableReplicas Unavailable replica count.
 * @property conditions Status conditions.
 * @property events Event list.
 * @property startTime Start time; nullable.
 * @property endTime End time; nullable.
 * @property duration Duration; optional, defaults to null.
 */
data class DeploymentStatus(
    val phase: DeploymentPhase,
    val replicas: Int,
    val readyReplicas: Int,
    val availableReplicas: Int,
    val updatedReplicas: Int,
    val unavailableReplicas: Int,
    val conditions: List<DeploymentCondition>,
    val events: List<DeploymentEvent>,
    val startTime: Long?,
    val endTime: Long?,
    val duration: Long? = null
)
// 部署条件
/**
 * One condition entry in a deployment's status.
 *
 * @property type Condition type.
 * @property status Condition status.
 * @property lastUpdateTime Time of the last update.
 * @property lastTransitionTime Time of the last transition.
 * @property reason Reason; nullable.
 * @property message Message; nullable.
 */
data class DeploymentCondition(
    val type: ConditionType,
    val status: ConditionStatus,
    val lastUpdateTime: Long,
    val lastTransitionTime: Long,
    val reason: String?,
    val message: String?
)
2. 持续集成/持续部署流水线
2.1 CI/CD流水线实现
kotlin
class CICDPipeline(
private val gitService: GitService,
private val buildService: BuildService,
private val testService: TestService,
private val deployService: DeployService,
private val registryService: RegistryService
) {
// 触发流水线
/**
 * Runs the whole CI/CD pipeline for [trigger]: checkout, quality checks, build,
 * unit and integration tests, image build, security scan, image push, test
 * deployment, end-to-end tests, production deployment and validation.
 *
 * The pipeline stops at the first step that reports `success == false` and
 * returns a failure result carrying that step's result object. Whatever the
 * outcome, the pipeline record gets its end time set and is persisted.
 *
 * Changes from the previous version:
 * - `currentStep` is set before a step starts, so a step that fails or throws
 *   is the one recorded (previously the last completed step was recorded).
 * - Step-level failures now mark the pipeline as FAILED before it is saved.
 * - Coroutine cancellation is rethrown instead of being reported as an
 *   ordinary pipeline error.
 *
 * @param trigger Describes what to build and which production strategy to use.
 * @return A success result wrapping the pipeline record, or a failure result.
 */
suspend fun triggerPipeline(trigger: PipelineTrigger): PipelineResult {
    val pipeline = createPipeline(trigger)

    // Flags the run as failed before a step-level failure is returned, so the
    // record persisted in `finally` never keeps a stale status.
    fun failed(result: PipelineResult): PipelineResult {
        pipeline.status = PipelineStatus.FAILED
        return result
    }

    return try {
        // 1. Check out the source
        pipeline.currentStep = "checkout"
        val source = checkoutSource(trigger)

        // 2. Code quality checks
        pipeline.currentStep = "quality"
        val qualityResult = runQualityChecks(source)
        if (!qualityResult.success) {
            return failed(PipelineResult.failure("质量检查失败", qualityResult))
        }

        // 3. Build
        pipeline.currentStep = "build"
        val buildResult = build(source)
        if (!buildResult.success) {
            return failed(PipelineResult.failure("构建失败", buildResult))
        }

        // 4. Unit tests
        pipeline.currentStep = "unit_test"
        val unitTestResult = runUnitTests(buildResult)
        if (!unitTestResult.success) {
            return failed(PipelineResult.failure("单元测试失败", unitTestResult))
        }

        // 5. Integration tests
        pipeline.currentStep = "integration_test"
        val integrationTestResult = runIntegrationTests(buildResult)
        if (!integrationTestResult.success) {
            return failed(PipelineResult.failure("集成测试失败", integrationTestResult))
        }

        // 6. Build the container image
        pipeline.currentStep = "docker_build"
        val imageResult = buildImage(buildResult)
        if (!imageResult.success) {
            return failed(PipelineResult.failure("镜像构建失败", imageResult))
        }

        // 7. Security scan
        pipeline.currentStep = "security_scan"
        val securityResult = scanSecurity(imageResult)
        if (!securityResult.success) {
            return failed(PipelineResult.failure("安全扫描失败", securityResult))
        }

        // 8. Push to the image registry
        pipeline.currentStep = "docker_push"
        val pushResult = pushImage(imageResult)
        if (!pushResult.success) {
            return failed(PipelineResult.failure("镜像推送失败", pushResult))
        }

        // 9. Deploy to the test environment
        pipeline.currentStep = "deploy_test"
        val testDeployResult = deployToTest(pushResult)
        if (!testDeployResult.success) {
            return failed(PipelineResult.failure("测试环境部署失败", testDeployResult))
        }

        // 10. End-to-end tests
        pipeline.currentStep = "e2e_test"
        val e2eTestResult = runE2ETests(testDeployResult)
        if (!e2eTestResult.success) {
            return failed(PipelineResult.failure("端到端测试失败", e2eTestResult))
        }

        // 11. Deploy to production
        pipeline.currentStep = "deploy_prod"
        val prodDeployResult = deployToProduction(testDeployResult, trigger.deploymentStrategy)
        if (!prodDeployResult.success) {
            return failed(PipelineResult.failure("生产环境部署失败", prodDeployResult))
        }

        // 12. Validate the deployment
        pipeline.currentStep = "validate"
        val validationResult = validateDeployment(prodDeployResult)
        if (!validationResult.success) {
            return failed(PipelineResult.failure("部署验证失败", validationResult))
        }

        pipeline.status = PipelineStatus.SUCCESS
        PipelineResult.success(pipeline)
    } catch (e: kotlin.coroutines.cancellation.CancellationException) {
        // Cancellation must propagate to the caller; record it and rethrow.
        pipeline.status = PipelineStatus.FAILED
        pipeline.error = e.message
        throw e
    } catch (e: Exception) {
        log.error("流水线执行失败", e)
        pipeline.status = PipelineStatus.FAILED
        pipeline.error = e.message
        PipelineResult.failure("流水线执行异常: ${e.message}", pipeline)
    } finally {
        pipeline.endTime = System.currentTimeMillis()
        savePipelineResult(pipeline)
    }
}
// Git代码检出
/**
 * Clones the repository named by [trigger] into a fresh temporary directory
 * and, when a commit is given, checks that commit out.
 *
 * The directory is created with `java.nio.file.Files.createTempDirectory`
 * rather than the deprecated `createTempDir`, which creates the directory with
 * overly wide permissions. If cloning or checkout throws, the directory is
 * deleted before the exception is rethrown.
 *
 * @return The checked-out source; its commit is the requested commit or, when
 *   none was requested, whatever `gitService.getCurrentCommit` reports.
 */
private suspend fun checkoutSource(trigger: PipelineTrigger): SourceCode {
    return withContext(Dispatchers.IO) {
        val repoUrl = trigger.repository.url
        val branch = trigger.branch
        val commit = trigger.commit
        log.info("开始检出代码: $repoUrl, 分支: $branch, 提交: $commit")

        val sourceDir = java.nio.file.Files.createTempDirectory("source_").toFile()
        try {
            // Clone the repository
            gitService.clone(repoUrl, branch, sourceDir.absolutePath)
            // Switch to the requested commit, if any
            if (commit != null) {
                gitService.checkout(commit, sourceDir.absolutePath)
            }
            val sourceCode = SourceCode(
                directory = sourceDir,
                repository = repoUrl,
                branch = branch,
                commit = commit ?: gitService.getCurrentCommit(sourceDir.absolutePath),
                author = gitService.getAuthor(sourceDir.absolutePath),
                timestamp = System.currentTimeMillis()
            )
            log.info("代码检出完成: ${sourceCode.commit}")
            sourceCode
        } catch (e: Throwable) {
            // Do not leave a half-populated checkout behind.
            sourceDir.deleteRecursively()
            throw e
        }
    }
}
// 代码质量检查
/**
 * Runs the four quality checks (lint, security, dependency, coverage) against
 * [source] and combines them into one [QualityResult].
 *
 * A check that throws is recorded as failed with the exception message; the
 * remaining checks still run. Coroutine cancellation is the exception: it is
 * rethrown rather than recorded as a failed check. The overall result succeeds
 * only if every check succeeds; the score is the mean of the scores that are
 * present, or 0.0 when none is.
 *
 * The scratch directory is created with `Files.createTempDirectory` instead of
 * the deprecated `createTempDir`.
 */
private suspend fun runQualityChecks(source: SourceCode): QualityResult {
    return withContext(Dispatchers.IO) {
        val qualityDir = java.nio.file.Files.createTempDirectory("quality_").toFile()
        val checks = listOf(
            QualityCheck("代码风格", "ktlint") {
                runLintCheck(source.directory, qualityDir)
            },
            QualityCheck("安全检查", "detekt") {
                runSecurityCheck(source.directory, qualityDir)
            },
            QualityCheck("依赖检查", "dependency") {
                runDependencyCheck(source.directory)
            },
            QualityCheck("测试覆盖率", "jacoco") {
                runCoverageCheck(source.directory)
            }
        )
        val results = checks.map { check ->
            try {
                val result = check.checker()
                QualityCheckResult(
                    name = check.name,
                    tool = check.tool,
                    success = result.success,
                    score = result.score,
                    issues = result.issues,
                    details = result.details
                )
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                throw e
            } catch (e: Exception) {
                QualityCheckResult(
                    name = check.name,
                    tool = check.tool,
                    success = false,
                    issues = listOf("检查失败: ${e.message}")
                )
            }
        }
        val overallSuccess = results.all { it.success }
        // average() of an empty list is NaN; fall back to 0.0 in that case.
        val averageScore = results.mapNotNull { it.score }.average().takeIf { !it.isNaN() } ?: 0.0
        QualityResult(
            success = overallSuccess,
            score = averageScore,
            checks = results,
            timestamp = System.currentTimeMillis()
        )
    }
}
// 构建项目
/**
 * Builds the project in [source] as a release/prod build and logs the outcome.
 *
 * The output directory is created with `Files.createTempDirectory` instead of
 * the deprecated `createTempDir`, which creates the directory with overly wide
 * permissions.
 *
 * @return Whatever `buildService.build` returns, successful or not.
 */
private suspend fun build(source: SourceCode): BuildResult {
    return withContext(Dispatchers.IO) {
        val buildDir = java.nio.file.Files.createTempDirectory("build_").toFile()
        log.info("开始构建项目: ${source.directory.absolutePath}")
        val buildConfig = BuildConfig(
            sourceDir = source.directory,
            outputDir = buildDir,
            buildType = "release",
            flavor = "prod",
            targetSdk = 31,
            minSdk = 24,
            enableProguard = true,
            enableShrinkResources = true
        )
        val result = buildService.build(buildConfig)
        if (result.success) {
            log.info("构建成功: ${result.output?.apkPath}")
        } else {
            log.error("构建失败: ${result.error}")
        }
        result
    }
}
// 运行单元测试
/**
 * Runs the full unit-test suite with coverage and parallel execution enabled,
 * then generates a "unit" report in the scratch directory.
 *
 * The scratch directory is created with `Files.createTempDirectory` instead of
 * the deprecated `createTempDir`.
 *
 * NOTE(review): [buildResult] is not read here, so the test configuration does
 * not refer to the build output — confirm that is intended.
 */
private suspend fun runUnitTests(buildResult: BuildResult): TestResult {
    return withContext(Dispatchers.IO) {
        val testDir = java.nio.file.Files.createTempDirectory("unit_test_").toFile()
        val testConfig = TestConfig(
            testType = TestType.UNIT,
            coverageEnabled = true,
            parallelEnabled = true,
            testClasses = emptyList(), // empty list: run every test
            outputDir = testDir
        )
        val result = testService.runTests(testConfig)
        generateTestReport(result, testDir, "unit")
        result
    }
}
// 构建Docker镜像
/**
 * Builds the container image for [buildResult] using the Dockerfile path
 * recorded in the build output.
 *
 * The build context is the Dockerfile's parent directory. It is resolved via
 * the absolute path: `File("Dockerfile").parentFile` is null for a bare
 * relative name, which previously passed a null context to the image build.
 *
 * @throws IllegalStateException if the build output has no Dockerfile path.
 */
private suspend fun buildImage(buildResult: BuildResult): ImageBuildResult {
    return withContext(Dispatchers.IO) {
        val dockerfilePath = checkNotNull(buildResult.output?.dockerfilePath) { "Dockerfile not found" }
        val dockerfile = File(dockerfilePath)
        val imageName = generateImageName(buildResult.version)
        val buildConfig = ImageBuildConfig(
            dockerfile = dockerfile,
            context = dockerfile.absoluteFile.parentFile,
            imageName = imageName,
            tags = listOf("latest", buildResult.version),
            buildArgs = emptyMap(),
            platform = "linux/amd64,linux/arm64"
        )
        registryService.buildImage(buildConfig)
    }
}
// 部署到测试环境
/**
 * Deploys the image described by [imageResult] to the `test` namespace with
 * two replicas, debug logging and small CPU/memory requests.
 *
 * @return The result reported by `deployService.deploy`.
 */
private suspend fun deployToTest(imageResult: ImageBuildResult): DeployResult =
    withContext(Dispatchers.IO) {
        val testEnv = mapOf(
            "ENVIRONMENT" to "test",
            "LOG_LEVEL" to "debug"
        )
        val testResources = ResourceRequirements(
            cpu = ResourceLimit("100m", "500m"),
            memory = ResourceLimit("256Mi", "1Gi")
        )
        val testConfig = DeploymentConfig(
            deploymentId = generateDeploymentId("test"),
            name = "${imageResult.imageName}-test",
            namespace = "test",
            image = imageResult.imageWithTag,
            replicas = 2,
            env = testEnv,
            resources = testResources
        )
        deployService.deploy(testConfig)
    }
// 部署到生产环境
/**
 * Deploys the image that was verified in the test environment to the
 * `production` namespace with four replicas, the given rollout [strategy],
 * info-level logging, and liveness/readiness probes on `/health` and `/ready`.
 *
 * The production name is derived from the test deployment's name by swapping
 * only a trailing `-test` for `-prod`. The previous `replace("-test", "-prod")`
 * rewrote every occurrence, so a name like `a-test-svc-test` was mangled.
 *
 * @param testResult Result of the test deployment; supplies name and image.
 * @param strategy Rollout strategy for production.
 */
private suspend fun deployToProduction(
    testResult: DeployResult,
    strategy: DeploymentStrategy
): DeployResult {
    return withContext(Dispatchers.IO) {
        val deploymentConfig = DeploymentConfig(
            deploymentId = generateDeploymentId("prod"),
            name = testResult.deployment.name.removeSuffix("-test") + "-prod",
            namespace = "production",
            image = testResult.deployment.image,
            replicas = 4,
            deploymentStrategy = strategy,
            env = mapOf(
                "ENVIRONMENT" to "production",
                "LOG_LEVEL" to "info"
            ),
            resources = ResourceRequirements(
                cpu = ResourceLimit("200m", "1"),
                memory = ResourceLimit("512Mi", "2Gi")
            ),
            healthChecks = HealthCheckConfig(
                livenessProbe = HealthProbe(
                    path = "/health",
                    initialDelaySeconds = 30,
                    periodSeconds = 10
                ),
                readinessProbe = HealthProbe(
                    path = "/ready",
                    initialDelaySeconds = 5,
                    periodSeconds = 5
                )
            )
        )
        deployService.deploy(deploymentConfig)
    }
}
// 辅助方法
/**
 * Builds the image reference `<registry>/<project>:<version>`.
 *
 * The registry comes from the `CI_REGISTRY` environment variable and the
 * project from `CI_PROJECT_NAME`; when unset they fall back to
 * `registry.example.com` and `model-service`.
 */
private fun generateImageName(version: String): String {
    val registryHost = System.getenv("CI_REGISTRY") ?: "registry.example.com"
    val projectName = System.getenv("CI_PROJECT_NAME") ?: "model-service"
    return buildString {
        append(registryHost)
        append('/')
        append(projectName)
        append(':')
        append(version)
    }
}
/**
 * Generates an ID of the form `deploy-<environment>-<yyyyMMddHHmmss>-<4 digits>`.
 *
 * The timestamp is formatted with `Locale.ROOT`, so the digits are always
 * ASCII whatever the JVM's default locale is. The random part now covers the
 * whole 1000..9999 range; the upper bound of `Random.nextInt(from, until)` is
 * exclusive, so the previous `nextInt(1000, 9999)` could never produce 9999.
 */
private fun generateDeploymentId(environment: String): String {
    val timestamp = SimpleDateFormat("yyyyMMddHHmmss", java.util.Locale.ROOT).format(Date())
    val random = Random.nextInt(1000, 10000)
    return "deploy-$environment-$timestamp-$random"
}
}
3. 容器编排与部署
3.1 Kubernetes部署管理器
kotlin
class KubernetesDeployer(
private val k8sClient: KubernetesClient,
private val configManager: ConfigManager
) {
// 部署应用
/**
 * Applies every Kubernetes resource needed for [deploymentConfig] and waits
 * for the deployment to report the configured number of replicas.
 *
 * Steps, in order: ensure the namespace exists; apply the ConfigMap; apply
 * each secret; apply the Service; apply the Deployment; apply an HPA when
 * `maxReplicas > replicas`; apply an Ingress when one is configured; wait for
 * readiness.
 *
 * Failures are returned as `DeployResult.failure` rather than thrown. The one
 * exception is coroutine cancellation, which is rethrown so a cancelled caller
 * is not handed a "deployment failed" result.
 *
 * @return Success with the service endpoints, or a failure with a message.
 */
suspend fun deploy(deploymentConfig: DeploymentConfig): DeployResult {
    return withContext(Dispatchers.IO) {
        try {
            log.info("开始部署: ${deploymentConfig.name}")
            // 1. Create the namespace if it does not exist
            createNamespaceIfNotExists(deploymentConfig.namespace)
            // 2. Create the ConfigMap
            val configMap = createConfigMap(deploymentConfig)
            k8sClient.configMaps().inNamespace(deploymentConfig.namespace)
                .createOrReplace(configMap)
            // 3. Create the secrets
            deploymentConfig.secrets.forEach { secret ->
                val k8sSecret = createSecret(secret, deploymentConfig.namespace)
                k8sClient.secrets().inNamespace(deploymentConfig.namespace)
                    .createOrReplace(k8sSecret)
            }
            // 4. Create the Service
            val service = createService(deploymentConfig)
            k8sClient.services().inNamespace(deploymentConfig.namespace)
                .createOrReplace(service)
            // 5. Create the Deployment
            val deployment = createDeployment(deploymentConfig)
            k8sClient.apps().deployments().inNamespace(deploymentConfig.namespace)
                .createOrReplace(deployment)
            // 6. Create the HPA when there is headroom above the configured replica count
            if (deploymentConfig.maxReplicas > deploymentConfig.replicas) {
                val hpa = createHPA(deploymentConfig)
                k8sClient.autoscaling().v1()
                    .horizontalPodAutoscalers()
                    .inNamespace(deploymentConfig.namespace)
                    .createOrReplace(hpa)
            }
            // 7. Create the Ingress when one is configured
            if (deploymentConfig.ingress != null) {
                val ingress = createIngress(deploymentConfig)
                k8sClient.network().v1()
                    .ingresses()
                    .inNamespace(deploymentConfig.namespace)
                    .createOrReplace(ingress)
            }
            // 8. Wait until the deployment is ready
            val isReady = waitForDeploymentReady(
                deploymentConfig.namespace,
                deploymentConfig.name,
                deploymentConfig.replicas
            )
            if (isReady) {
                val endpoints = getServiceEndpoints(
                    deploymentConfig.namespace,
                    deploymentConfig.name
                )
                log.info("部署完成: ${deploymentConfig.name}")
                DeployResult.success(deploymentConfig, endpoints)
            } else {
                log.error("部署未就绪: ${deploymentConfig.name}")
                DeployResult.failure("部署未就绪")
            }
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation is not a deployment error; let it propagate.
            throw e
        } catch (e: Exception) {
            log.error("部署失败: ${deploymentConfig.name}", e)
            DeployResult.failure("部署失败: ${e.message}")
        }
    }
}
// 创建部署资源
/**
 * Assembles the `apps/v1` Deployment object for [config].
 *
 * Every piece (labels, selector, strategy, pod template contents) is produced
 * by a helper first, in the same order the builder consumes it, and the
 * fluent builder then stitches the pieces together.
 */
private fun createDeployment(config: DeploymentConfig): Deployment {
    // Deployment-level metadata
    val deploymentLabels = getLabels(config)
    val deploymentAnnotations = getAnnotations(config)
    // Spec-level pieces
    val podSelector = LabelSelectorBuilder()
        .withMatchLabels(getSelectorLabels(config))
        .build()
    val rolloutStrategy = createDeploymentStrategy(config.deploymentStrategy)
    // Pod template metadata
    val podLabels = getPodLabels(config)
    val podAnnotations = getPodAnnotations(config)
    // Pod spec pieces
    val containers = createContainers(config)
    val initContainers = createInitContainers(config)
    val volumes = createVolumes(config)
    val pullSecrets = createImagePullSecrets(config)
    val nodeSelector = createNodeSelector(config)
    val affinity = createAffinity(config)
    val tolerations = createTolerations(config)
    val podSecurity = createPodSecurityContext(config.securityContext)

    return DeploymentBuilder()
        .withApiVersion("apps/v1")
        .withKind("Deployment")
        .withNewMetadata()
            .withName(config.name)
            .withNamespace(config.namespace)
            .withLabels(deploymentLabels)
            .withAnnotations(deploymentAnnotations)
        .endMetadata()
        .withNewSpec()
            .withReplicas(config.replicas)
            .withSelector(podSelector)
            .withStrategy(rolloutStrategy)
            .withNewTemplate()
                .withNewMetadata()
                    .withLabels(podLabels)
                    .withAnnotations(podAnnotations)
                .endMetadata()
                .withNewSpec()
                    .withContainers(containers)
                    .withInitContainers(initContainers)
                    .withVolumes(volumes)
                    .withImagePullSecrets(pullSecrets)
                    .withNodeSelector(nodeSelector)
                    .withAffinity(affinity)
                    .withTolerations(tolerations)
                    .withSecurityContext(podSecurity)
                .endSpec()
            .endTemplate()
        .endSpec()
        .build()
}
// 创建容器配置
/**
 * Produces the container list for the pod: a single container named after the
 * deployment, with image, ports, environment, resources, the three probes,
 * volume mounts and security context all derived from [config].
 */
private fun createContainers(config: DeploymentConfig): List<Container> = listOf(
    ContainerBuilder()
        .withName(config.name)
        .withImage(config.image)
        .withImagePullPolicy("IfNotPresent")
        .withPorts(createContainerPorts(config))
        .withEnv(createEnvVars(config))
        .withEnvFrom(createEnvFroms(config))
        .withResources(createResourceRequirements(config.resources))
        .withLivenessProbe(createProbe(config.livenessProbe))
        .withReadinessProbe(createProbe(config.readinessProbe))
        .withStartupProbe(createProbe(config.startupProbe))
        .withVolumeMounts(createVolumeMounts(config))
        .withSecurityContext(createContainerSecurityContext(config.securityContext))
        .build()
)
// 创建资源需求
/**
 * Converts the platform's [ResourceRequirements] into the fabric8 model.
 *
 * CPU and memory always appear in both requests and limits; the GPU pair is
 * added under the `nvidia.com/gpu` key when present. `resources.storage` is
 * not read by this function.
 */
private fun createResourceRequirements(resources: ResourceRequirements): io.fabric8.kubernetes.api.model.ResourceRequirements {
    val requested = mutableMapOf(
        "cpu" to Quantity(resources.cpu.request),
        "memory" to Quantity(resources.memory.request)
    )
    val capped = mutableMapOf(
        "cpu" to Quantity(resources.cpu.limit),
        "memory" to Quantity(resources.memory.limit)
    )
    val gpuSpec = resources.gpu
    if (gpuSpec != null) {
        requested["nvidia.com/gpu"] = Quantity(gpuSpec.request)
        capped["nvidia.com/gpu"] = Quantity(gpuSpec.limit)
    }
    val builder = io.fabric8.kubernetes.api.model.ResourceRequirementsBuilder()
    builder.withRequests(requested)
    builder.withLimits(capped)
    return builder.build()
}
// 创建健康检查探针
/**
 * Translates a [HealthProbeConfig] into a Kubernetes probe.
 *
 * HTTP and TCP probes take their target from `endpoint` (split by
 * `parseEndpoint`); EXEC probes take it from `command`. The timing and
 * threshold fields are copied over unchanged.
 *
 * Changes from the previous version:
 * - A probe whose type needs an endpoint or command but has none now fails
 *   immediately with a descriptive message, instead of yielding a probe with
 *   no handler.
 * - The HTTP branch no longer builds a throwaway `HTTPGetAction` just to copy
 *   its fields into the probe builder.
 *
 * @return The probe, or null when [probeConfig] is null.
 * @throws IllegalArgumentException if the field required by the probe type is missing.
 */
private fun createProbe(probeConfig: HealthProbeConfig?): Probe? {
    if (probeConfig == null) return null
    val probeBuilder = ProbeBuilder()
    when (probeConfig.type) {
        ProbeType.HTTP -> {
            val endpoint = requireNotNull(probeConfig.endpoint) { "HTTP probe requires an endpoint" }
            val (scheme, host, port, path) = parseEndpoint(endpoint)
            probeBuilder.withNewHttpGet()
                .withScheme(scheme)
                .withHost(host)
                .withPort(IntOrString(port))
                .withPath(path)
                .endHttpGet()
        }
        ProbeType.TCP -> {
            val endpoint = requireNotNull(probeConfig.endpoint) { "TCP probe requires an endpoint" }
            val (_, host, port, _) = parseEndpoint(endpoint)
            probeBuilder.withNewTcpSocket()
                .withHost(host)
                .withPort(IntOrString(port))
                .endTcpSocket()
        }
        ProbeType.EXEC -> {
            val command = requireNotNull(probeConfig.command) { "EXEC probe requires a command" }
            probeBuilder.withNewExec()
                .withCommand(command)
                .endExec()
        }
    }
    return probeBuilder
        .withInitialDelaySeconds(probeConfig.initialDelaySeconds)
        .withPeriodSeconds(probeConfig.periodSeconds)
        .withTimeoutSeconds(probeConfig.timeoutSeconds)
        .withSuccessThreshold(probeConfig.successThreshold)
        .withFailureThreshold(probeConfig.failureThreshold)
        .build()
}
// 等待部署就绪
/**
 * Polls the named Deployment until its ready, updated and available replica
 * counts all equal [expectedReplicas], the rollout is reported as failed, or
 * [timeoutSeconds] elapses.
 *
 * Changes from the previous version:
 * - Only `Progressing=False` ends the wait early. `Available=False` is an
 *   expected intermediate state while pods are still starting, and treating it
 *   as fatal made a fresh rollout fail on the first poll.
 * - A missing `status` and missing replica counters are read as 0 instead of
 *   throwing, so waiting for 0 replicas can also complete.
 * - Coroutine cancellation is rethrown rather than logged as a polling error.
 *
 * @param namespace Namespace of the deployment.
 * @param name Deployment name.
 * @param expectedReplicas Replica count that must be ready, updated and available.
 * @param timeoutSeconds Maximum time to wait, in seconds.
 * @return true when the deployment became ready; false on failure or timeout.
 */
private suspend fun waitForDeploymentReady(
    namespace: String,
    name: String,
    expectedReplicas: Int,
    timeoutSeconds: Long = 300
): Boolean {
    return withContext(Dispatchers.IO) {
        val startTime = System.currentTimeMillis()
        val timeoutMillis = timeoutSeconds * 1000
        while (System.currentTimeMillis() - startTime < timeoutMillis) {
            try {
                val deployment = k8sClient.apps().deployments()
                    .inNamespace(namespace)
                    .withName(name)
                    .get()
                if (deployment == null) {
                    log.warn("Deployment not found: $name")
                    delay(1000)
                    continue
                }
                val status = deployment.status
                val specReplicas = deployment.spec?.replicas
                // Counters may be absent from the status; read an absent counter as 0.
                val statusReplicas = status?.replicas ?: 0
                val readyReplicas = status?.readyReplicas ?: 0
                val updatedReplicas = status?.updatedReplicas ?: 0
                val availableReplicas = status?.availableReplicas ?: 0
                log.info("Deployment status: spec=$specReplicas, " +
                    "status=$statusReplicas, ready=$readyReplicas, " +
                    "updated=$updatedReplicas, available=$availableReplicas")
                if (readyReplicas == expectedReplicas &&
                    updatedReplicas == expectedReplicas &&
                    availableReplicas == expectedReplicas) {
                    log.info("Deployment is ready: $name")
                    return@withContext true
                }
                // Inspect the deployment conditions
                val conditions = status?.conditions ?: emptyList()
                val progressingCondition = conditions.find { it.type == "Progressing" }
                val availableCondition = conditions.find { it.type == "Available" }
                // Only a stalled rollout (Progressing=False) is terminal.
                if (progressingCondition?.status == "False") {
                    log.error("Deployment failed: $name, " +
                        "progressing=${progressingCondition.status}, " +
                        "available=${availableCondition?.status}")
                    return@withContext false
                }
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                throw e
            } catch (e: Exception) {
                log.error("Error checking deployment status", e)
            }
            delay(3000) // wait 3 seconds before the next poll
        }
        log.error("Deployment timeout: $name")
        false
    }
}
// 回滚部署
/**
 * Rolls the named deployment back to [revision] or, when no revision is given,
 * to the revision before the current one, then waits for it to become ready.
 *
 * Changes from the previous version:
 * - Readiness is awaited for the deployment's own `spec.replicas` (falling
 *   back to 1 when it cannot be read). The previous hard-coded 1 meant a
 *   deployment with any other replica count could never be seen as ready.
 * - Coroutine cancellation is rethrown instead of being reported as a failed
 *   rollback.
 *
 * @param namespace Namespace of the deployment.
 * @param name Deployment name.
 * @param revision Target revision, or null for the previous revision.
 * @return Success carrying [name] and the requested [revision], or a failure with a message.
 */
suspend fun rollback(
    namespace: String,
    name: String,
    revision: Int? = null
): RollbackResult {
    return withContext(Dispatchers.IO) {
        try {
            log.info("开始回滚部署: $name, revision=$revision")
            val deployment = k8sClient.apps().deployments()
                .inNamespace(namespace)
                .withName(name)
            if (revision != null) {
                // Roll back to the requested revision
                deployment.rollback()
                    .toRevision(revision.toLong())
                    .rollback()
            } else {
                // Roll back to the previous revision
                val currentRevision = getCurrentRevision(namespace, name)
                val previousRevision = getPreviousRevision(namespace, name, currentRevision)
                if (previousRevision != null) {
                    deployment.rollback()
                        .toRevision(previousRevision)
                        .rollback()
                } else {
                    return@withContext RollbackResult.failure("没有可用的回滚版本")
                }
            }
            // Wait for the deployment's own desired replica count.
            val expectedReplicas = deployment.get()?.spec?.replicas ?: 1
            val isReady = waitForDeploymentReady(namespace, name, expectedReplicas)
            if (isReady) {
                log.info("回滚完成: $name")
                RollbackResult.success(name, revision)
            } else {
                RollbackResult.failure("回滚后部署未就绪")
            }
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            throw e
        } catch (e: Exception) {
            log.error("回滚失败: $name", e)
            RollbackResult.failure("回滚失败: ${e.message}")
        }
    }
}
// 伸缩部署
/**
 * Scales the named deployment to [replicas] and waits until that many
 * replicas are ready.
 *
 * Errors are returned as `ScaleResult.failure`. Coroutine cancellation is the
 * exception: it is rethrown so a cancelled caller is not handed a
 * "scaling failed" result.
 *
 * @param namespace Namespace of the deployment.
 * @param name Deployment name.
 * @param replicas Desired replica count.
 */
suspend fun scale(
    namespace: String,
    name: String,
    replicas: Int
): ScaleResult {
    return withContext(Dispatchers.IO) {
        try {
            log.info("开始伸缩部署: $name, replicas=$replicas")
            k8sClient.apps().deployments()
                .inNamespace(namespace)
                .withName(name)
                .scale(replicas)
            // Wait for the scaling to complete
            val isReady = waitForDeploymentReady(namespace, name, replicas)
            if (isReady) {
                log.info("伸缩完成: $name, replicas=$replicas")
                ScaleResult.success(name, replicas)
            } else {
                ScaleResult.failure("伸缩后部署未就绪")
            }
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            throw e
        } catch (e: Exception) {
            log.error("伸缩失败: $name", e)
            ScaleResult.failure("伸缩失败: ${e.message}")
        }
    }
}
}
4. 监控告警系统
4.1 指标收集与监控
kotlin
class MonitoringSystem(
private val metricsCollector: MetricsCollector,
private val alertManager: AlertManager,
private val timeSeriesDB: TimeSeriesDB
) {
// 监控指标定义
/**
 * Declarative description of one monitored metric.
 *
 * @property name Metric name.
 * @property type Metric type.
 * @property help Human-readable description of the metric.
 * @property labels Label names; empty by default.
 * @property unit Unit of the metric; optional.
 * @property aggregation Aggregation method; optional.
 * @property alertThresholds Alert thresholds; empty by default.
 */
data class MetricDefinition(
    val name: String,
    val type: MetricType,
    val help: String,
    val labels: List<String> = emptyList(),
    val unit: MetricUnit? = null,
    val aggregation: Aggregation? = null,
    val alertThresholds: List<AlertThreshold> = emptyList()
)
// 预定义指标
/**
 * Built-in metric catalogue, grouped as system, application, business,
 * experiment and performance metrics. The small local factories below only
 * remove repetition; each entry carries the same values as a hand-written
 * [MetricDefinition] would.
 */
private val predefinedMetrics = run {
    // WARNING / CRITICAL threshold pair.
    fun warnCritical(warning: Double, critical: Double) = listOf(
        AlertThreshold(warning, AlertSeverity.WARNING),
        AlertThreshold(critical, AlertSeverity.CRITICAL)
    )

    // Percentage gauge, optionally labelled and/or alerting.
    fun percentGauge(
        name: String,
        help: String,
        labels: List<String> = emptyList(),
        alerts: List<AlertThreshold> = emptyList()
    ) = MetricDefinition(
        name = name,
        type = MetricType.GAUGE,
        help = help,
        labels = labels,
        unit = MetricUnit.PERCENT,
        alertThresholds = alerts
    )

    // Unit-less counter with labels.
    fun counter(name: String, help: String, labels: List<String>) = MetricDefinition(
        name = name,
        type = MetricType.COUNTER,
        help = help,
        labels = labels
    )

    // Histogram with labels and a unit.
    fun histogram(name: String, help: String, labels: List<String>, unit: MetricUnit) = MetricDefinition(
        name = name,
        type = MetricType.HISTOGRAM,
        help = help,
        labels = labels,
        unit = unit
    )

    listOf(
        // System metrics
        percentGauge("cpu_usage", "CPU使用率", alerts = warnCritical(90.0, 95.0)),
        percentGauge("memory_usage", "内存使用率", alerts = warnCritical(85.0, 95.0)),
        // Application metrics
        counter("http_requests_total", "HTTP请求总数", listOf("method", "path", "status")),
        histogram("http_request_duration_seconds", "HTTP请求延迟", listOf("method", "path"), MetricUnit.SECONDS),
        counter("http_request_errors_total", "HTTP错误请求数", listOf("method", "path", "error")),
        // Business metrics
        counter("model_inference_total", "模型推理总数", listOf("model_id", "version", "status")),
        histogram("model_inference_duration_ms", "模型推理延迟", listOf("model_id", "version"), MetricUnit.MILLISECONDS),
        counter("model_inference_errors_total", "模型推理错误数", listOf("model_id", "version", "error_type")),
        // Experiment metrics
        counter("experiment_assignments_total", "实验分配总数", listOf("experiment_id", "version")),
        percentGauge("experiment_conversion_rate", "实验转化率", labels = listOf("experiment_id", "version")),
        // Performance metrics
        percentGauge("gpu_utilization", "GPU利用率", alerts = warnCritical(80.0, 90.0)),
        percentGauge("network_bandwidth", "网络带宽使用率"),
        percentGauge("disk_usage", "磁盘使用率", alerts = warnCritical(85.0, 95.0))
    )
}
// 初始化监控
/**
 * Registers every predefined metric, creates one alert rule per configured
 * threshold, then starts metric collection, health checks and log aggregation
 * for the given deployment.
 *
 * The alert expression now compares the metric by its registered name. The
 * previous expression appended the unit enum to the name (for example
 * `cpu_usage_PERCENT`), while samples are recorded and dashboards query under
 * the plain name (`cpu_usage`), so the rule targeted a series that is never
 * recorded.
 *
 * @param namespace Namespace of the monitored deployment.
 * @param deploymentName Name of the monitored deployment.
 */
suspend fun initializeMonitoring(namespace: String, deploymentName: String) {
    // Register the metrics and their alert rules
    predefinedMetrics.forEach { metric ->
        metricsCollector.registerMetric(
            MetricRegistration(
                name = metric.name,
                type = metric.type,
                help = metric.help,
                labels = metric.labels
            )
        )
        // One rule per threshold; severity is carried as a label.
        metric.alertThresholds.forEach { threshold ->
            alertManager.createAlertRule(
                AlertRule(
                    name = "${metric.name}_threshold",
                    expr = "${metric.name} > ${threshold.value}",
                    duration = "5m",
                    labels = mapOf(
                        "severity" to threshold.severity.toString(),
                        "metric" to metric.name,
                        "namespace" to namespace,
                        "deployment" to deploymentName
                    ),
                    annotations = mapOf(
                        "description" to "${metric.help}超过阈值 ${threshold.value}${metric.unit?.symbol ?: ""}",
                        "summary" to "${metric.name}过高"
                    )
                )
            )
        }
    }
    // Start metric collection
    startMetricsCollection(namespace, deploymentName)
    // Start health checks
    startHealthChecks(namespace, deploymentName)
    // Start log aggregation
    startLogAggregation(namespace, deploymentName)
    log.info("监控系统初始化完成: namespace=$namespace, deployment=$deploymentName")
}
// 收集指标
/**
 * Endless collection loop: gathers system, application and business metrics,
 * pushes them to the time-series database, sleeps for the collection
 * interval, and repeats. After a failed round it logs the error and sleeps
 * for the error-retry delay instead.
 *
 * The function returns only by being cancelled. Cancellation is rethrown
 * straight away; previously it was first logged as a collection failure.
 */
suspend fun collectMetrics(namespace: String, deploymentName: String) {
    while (true) {
        try {
            // System metrics
            collectSystemMetrics(namespace, deploymentName)
            // Application metrics
            collectApplicationMetrics(namespace, deploymentName)
            // Business metrics
            collectBusinessMetrics(namespace, deploymentName)
            // Ship the samples to the time-series database
            sendMetricsToTSDB()
            delay(collectIntervalMillis)
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            throw e
        } catch (e: Exception) {
            log.error("收集指标失败", e)
            delay(errorRetryDelayMillis)
        }
    }
}
// 收集系统指标
/**
 * Records CPU, memory and network samples for every pod labelled
 * `app=<deploymentName>` in [namespace].
 *
 * CPU and memory samples carry namespace, pod and deployment labels; the two
 * network samples carry namespace and pod only.
 */
private suspend fun collectSystemMetrics(namespace: String, deploymentName: String) {
    val matchingPods = kubernetesClient.pods().inNamespace(namespace)
        .withLabels(mapOf("app" to deploymentName))
        .list().items

    for (pod in matchingPods) {
        val podName = pod.metadata.name
        val podLabels = mapOf(
            "namespace" to namespace,
            "pod" to podName
        )
        val podAndDeploymentLabels = podLabels + ("deployment" to deploymentName)

        // CPU usage
        val cpuUsage = getPodCPUUsage(namespace, podName)
        val cpuSample = MetricSample(
            name = "cpu_usage",
            value = cpuUsage,
            labels = podAndDeploymentLabels,
            timestamp = System.currentTimeMillis()
        )
        metricsCollector.recordMetric(cpuSample)

        // Memory usage
        val memoryUsage = getPodMemoryUsage(namespace, podName)
        val memorySample = MetricSample(
            name = "memory_usage",
            value = memoryUsage,
            labels = podAndDeploymentLabels,
            timestamp = System.currentTimeMillis()
        )
        metricsCollector.recordMetric(memorySample)

        // Network counters
        val networkStats = getPodNetworkStats(namespace, podName)
        val receiveSample = MetricSample(
            name = "network_receive_bytes",
            value = networkStats.receiveBytes,
            labels = podLabels,
            timestamp = System.currentTimeMillis()
        )
        metricsCollector.recordMetric(receiveSample)
        val transmitSample = MetricSample(
            name = "network_transmit_bytes",
            value = networkStats.transmitBytes,
            labels = podLabels,
            timestamp = System.currentTimeMillis()
        )
        metricsCollector.recordMetric(transmitSample)
    }
}
// 收集应用指标
/**
 * Scrapes `/metrics` from the deployment's Service (cluster IP plus its first
 * port) and records every parsed sample.
 *
 * Nothing is recorded when the Service is missing, has no ports, the request
 * fails, or the response is not successful.
 *
 * Changes from the previous version:
 * - A Service with an empty port list is skipped with a warning. Previously
 *   `ports[0]` threw outside the try block.
 * - Coroutine cancellation is rethrown instead of being logged as a scrape
 *   failure.
 */
private suspend fun collectApplicationMetrics(namespace: String, deploymentName: String) {
    val service = getServiceByName(namespace, deploymentName)
    if (service == null) {
        log.warn("Service not found: $deploymentName in $namespace")
        return
    }
    val port = service.spec.ports.firstOrNull()?.port
    if (port == null) {
        log.warn("Service has no ports: $deploymentName in $namespace")
        return
    }
    val serviceUrl = "http://${service.spec.clusterIP}:$port"
    // Fetch metrics from the application endpoint
    val metrics = try {
        val response = httpClient.get("$serviceUrl/metrics")
        if (response.isSuccessful) {
            parsePrometheusMetrics(response.body?.string() ?: "")
        } else {
            emptyList()
        }
    } catch (e: kotlin.coroutines.cancellation.CancellationException) {
        throw e
    } catch (e: Exception) {
        log.error("获取应用指标失败", e)
        emptyList()
    }
    // Record the application metrics
    metrics.forEach { metric ->
        metricsCollector.recordMetric(metric)
    }
}
// 查询指标
/**
 * Runs [query] against the time-series database on the IO dispatcher.
 *
 * @param query Query expression passed through unchanged.
 * @param startTime Start of the queried range.
 * @param endTime End of the queried range.
 * @param step Step passed through to the database.
 * @return The series returned by the database.
 */
suspend fun queryMetrics(
    query: String,
    startTime: Long,
    endTime: Long,
    step: String
): List<TimeSeries> = withContext(Dispatchers.IO) {
    timeSeriesDB.query(query = query, start = startTime, end = endTime, step = step)
}
// 获取指标告警
/**
 * Fetches alerts from the alert manager on the IO dispatcher. All five
 * arguments are optional and are passed on as a single [AlertFilter].
 *
 * @return The alerts returned by the alert manager for that filter.
 */
suspend fun getMetricAlerts(
    namespace: String? = null,
    deploymentName: String? = null,
    severity: AlertSeverity? = null,
    startTime: Long? = null,
    endTime: Long? = null
): List<Alert> = withContext(Dispatchers.IO) {
    alertManager.getAlerts(
        AlertFilter(
            namespace = namespace,
            deployment = deploymentName,
            severity = severity,
            startTime = startTime,
            endTime = endTime
        )
    )
}
// 创建监控面板
/**
 * Builds the monitoring dashboard for one deployment: CPU, memory, HTTP P99
 * latency, HTTP error rate, model inference P95 latency and experiment
 * conversion rate, refreshed every 30 seconds over the last hour.
 *
 * The error-rate query now sums both sides by `(method, path)` before
 * dividing. The error counter and the request counter are declared with
 * different label sets (`error` versus `status`), so the previous plain
 * division had no matching series and the panel stayed empty.
 *
 * NOTE(review): the latency and error-rate queries are not filtered by
 * namespace/deployment, unlike the CPU, memory and experiment panels — confirm
 * whether the application metrics carry those labels before adding a filter.
 *
 * @param namespace Namespace used in the label filters.
 * @param deploymentName Deployment used in the title and label filters.
 */
fun createDashboard(
    namespace: String,
    deploymentName: String
): Dashboard {
    return Dashboard(
        title = "$deploymentName 监控面板",
        panels = listOf(
            // CPU usage panel
            Panel(
                title = "CPU使用率",
                type = PanelType.GRAPH,
                metrics = listOf(
                    MetricQuery(
                        expr = "cpu_usage{namespace='$namespace',deployment='$deploymentName'}",
                        legendFormat = "{{pod}}"
                    )
                ),
                unit = "percent",
                thresholds = listOf(
                    Threshold(value = 80.0, color = "orange"),
                    Threshold(value = 90.0, color = "red")
                )
            ),
            // Memory usage panel
            Panel(
                title = "内存使用率",
                type = PanelType.GRAPH,
                metrics = listOf(
                    MetricQuery(
                        expr = "memory_usage{namespace='$namespace',deployment='$deploymentName'}",
                        legendFormat = "{{pod}}"
                    )
                ),
                unit = "percent",
                thresholds = listOf(
                    Threshold(value = 80.0, color = "orange"),
                    Threshold(value = 90.0, color = "red")
                )
            ),
            // Request latency panel
            Panel(
                title = "HTTP请求延迟(P99)",
                type = PanelType.GRAPH,
                metrics = listOf(
                    MetricQuery(
                        expr = "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, method, path))",
                        legendFormat = "{{method}} {{path}}"
                    )
                ),
                unit = "seconds"
            ),
            // Error rate panel: aggregate both sides to (method, path) so the division can match
            Panel(
                title = "HTTP错误率",
                type = PanelType.GRAPH,
                metrics = listOf(
                    MetricQuery(
                        expr = "sum(rate(http_request_errors_total[5m])) by (method, path) / " +
                            "sum(rate(http_requests_total[5m])) by (method, path) * 100",
                        legendFormat = "{{method}} {{path}}"
                    )
                ),
                unit = "percent"
            ),
            // Model inference latency panel
            Panel(
                title = "模型推理延迟(P95)",
                type = PanelType.GRAPH,
                metrics = listOf(
                    MetricQuery(
                        expr = "histogram_quantile(0.95, sum(rate(model_inference_duration_ms_bucket[5m])) by (le, model_id, version))",
                        legendFormat = "{{model_id}} {{version}}"
                    )
                ),
                unit = "milliseconds"
            ),
            // Experiment conversion rate panel
            Panel(
                title = "实验转化率",
                type = PanelType.BAR_CHART,
                metrics = listOf(
                    MetricQuery(
                        expr = "experiment_conversion_rate{namespace='$namespace',deployment='$deploymentName'}",
                        legendFormat = "{{experiment_id}} {{version}}"
                    )
                ),
                unit = "percent"
            )
        ),
        refreshInterval = "30s",
        timeRange = TimeRange(
            from = "now-1h",
            to = "now"
        )
    )
}
}
5. 自动扩缩容策略
5.1 基于指标的HPA控制器
kotlin
class AutoScaler(
private val k8sClient: KubernetesClient,
private val metricsClient: MetricsClient,
private val scalingHistory: ScalingHistory
) {
// 自动扩缩容配置
/**
 * Autoscaling settings for one deployment.
 *
 * @property namespace Namespace of the deployment.
 * @property deploymentName Name of the deployment.
 * @property minReplicas Lower bound for the replica count.
 * @property maxReplicas Upper bound for the replica count.
 * @property targetCPUUtilization Target CPU utilization; defaults to 70.
 * @property targetMemoryUtilization Target memory utilization; defaults to 80.
 * @property targetQPS Target QPS; optional.
 * @property scalingCooldown Cooldown time in seconds; defaults to 300.
 * @property stabilizationWindow Stabilization window in seconds; defaults to 300.
 * @property scalingPolicies Additional scaling policies; empty by default.
 */
data class ScalingConfig(
    val namespace: String,
    val deploymentName: String,
    val minReplicas: Int,
    val maxReplicas: Int,
    val targetCPUUtilization: Int = 70,
    val targetMemoryUtilization: Int = 80,
    val targetQPS: Int? = null,
    val scalingCooldown: Long = 300,
    val stabilizationWindow: Long = 300,
    val scalingPolicies: List<ScalingPolicy> = emptyList()
)
/**
 * A single scaling policy.
 *
 * @property type Kind of policy.
 * @property value Integer value of the policy; the policy evaluation code
 *   compares CPU, memory and QPS readings against it and uses it as a replica
 *   floor for schedule policies.
 * @property periodSeconds Period in seconds.
 */
data class ScalingPolicy(
    val type: ScalingType,
    val value: Int,
    val periodSeconds: Int
)
// 检查并执行扩缩容
/**
 * Evaluates every scaling configuration and returns one result per
 * configuration, in the same order.
 *
 * A configuration whose evaluation throws yields an error result and the loop
 * moves on to the next one. Coroutine cancellation is the exception: it is
 * rethrown so the whole pass stops, instead of being recorded as a scaling
 * error.
 */
suspend fun checkAndScale(): List<ScalingResult> {
    val results = mutableListOf<ScalingResult>()
    for (config in getScalingConfigs()) {
        try {
            results.add(checkAndScaleDeployment(config))
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            throw e
        } catch (e: Exception) {
            log.error("检查扩缩容失败: ${config.namespace}/${config.deploymentName}", e)
            results.add(ScalingResult.error(config, e.message ?: "未知错误"))
        }
    }
    return results
}
// 检查并伸缩单个部署
/**
 * Decides whether one deployment should be scaled and, if so, scales it.
 *
 * The outcome is, in order of precedence: no-op when the desired count equals
 * the current one; cooldown when the cooldown check fails; unstable when the
 * stabilization check fails; otherwise the result of the scaling call, which
 * is also written to the scaling history.
 */
private suspend fun checkAndScaleDeployment(config: ScalingConfig): ScalingResult {
    val current = getCurrentReplicas(config.namespace, config.deploymentName)
    val target = calculateDesiredReplicas(config, current)

    return when {
        target == current ->
            ScalingResult.noop(config, current)
        // Cooldown check
        !isCooledDown(config, current, target) ->
            ScalingResult.cooldown(config, current, target)
        // Stabilization window check
        !isStable(config, current, target) ->
            ScalingResult.unstable(config, current, target)
        else ->
            // Scale, then record the change in the history
            scaleDeployment(config, target).also { outcome ->
                scalingHistory.recordScaling(
                    namespace = config.namespace,
                    deployment = config.deploymentName,
                    fromReplicas = current,
                    toReplicas = target,
                    reason = outcome.reason
                )
            }
    }
}
// 计算期望副本数
/**
 * Computes the replica count the deployment should have right now: the larger
 * of the metric-based and policy-based proposals, clamped to
 * `[minReplicas, maxReplicas]`.
 *
 * `coerceIn` throws `IllegalArgumentException` when `minReplicas` is greater
 * than `maxReplicas`.
 *
 * @param config Scaling settings of the deployment.
 * @param currentReplicas Replica count the deployment has now.
 */
private suspend fun calculateDesiredReplicas(
    config: ScalingConfig,
    currentReplicas: Int
): Int {
    val metrics = getCurrentMetrics(config)
    val desiredByMetrics = calculateDesiredByMetrics(config, metrics, currentReplicas)
    val desiredByPolicies = calculateDesiredByPolicies(config, metrics, currentReplicas)
    // Take the maximum so that every condition is satisfied
    val desired = maxOf(desiredByMetrics, desiredByPolicies)
    // Keep the result within the configured bounds
    return desired.coerceIn(config.minReplicas, config.maxReplicas)
}

/**
 * Proposes a replica count from CPU, memory and QPS readings.
 *
 * CPU and memory use `ceil(currentReplicas * utilization / target)`; QPS uses
 * `ceil(qps / targetQPS)`. A reading is skipped when it is not positive, and
 * also when its target is not positive: dividing by zero would otherwise
 * become `Int.MAX_VALUE` after `toInt()`. The largest proposal wins;
 * [currentReplicas] is returned when no reading applies.
 *
 * [currentReplicas] is now an explicit parameter. The body used that name
 * before without it being in scope in this function.
 */
private fun calculateDesiredByMetrics(
    config: ScalingConfig,
    metrics: DeploymentMetrics,
    currentReplicas: Int
): Int {
    val proposals = mutableListOf<Int>()
    // CPU
    if (metrics.cpuUtilization > 0 && config.targetCPUUtilization > 0) {
        proposals.add(ceil(currentReplicas * metrics.cpuUtilization / config.targetCPUUtilization).toInt())
    }
    // Memory
    if (metrics.memoryUtilization > 0 && config.targetMemoryUtilization > 0) {
        proposals.add(ceil(currentReplicas * metrics.memoryUtilization / config.targetMemoryUtilization).toInt())
    }
    // QPS
    val targetQps = config.targetQPS
    if (metrics.qps > 0 && targetQps != null && targetQps > 0) {
        proposals.add(ceil(metrics.qps / targetQps).toInt())
    }
    return proposals.maxOrNull() ?: currentReplicas
}

/**
 * Proposes a replica count from the configured scaling policies, starting at
 * `minReplicas` and raising it for every policy that applies.
 *
 * CPU, memory and QPS policies apply when the reading exceeds the policy
 * value; schedule policies apply while `isInScheduleTime()` is true and use
 * the policy value as a replica floor. CPU, memory and QPS policies with a
 * non-positive value are skipped to avoid dividing by zero.
 *
 * [currentReplicas] is now an explicit parameter. The body used that name
 * before without it being in scope in this function.
 */
private fun calculateDesiredByPolicies(
    config: ScalingConfig,
    metrics: DeploymentMetrics,
    currentReplicas: Int
): Int {
    var desired = config.minReplicas
    config.scalingPolicies.forEach { policy ->
        when (policy.type) {
            ScalingType.CPU -> {
                if (policy.value > 0 && metrics.cpuUtilization > policy.value) {
                    val policyReplicas = ceil(currentReplicas * metrics.cpuUtilization / policy.value).toInt()
                    desired = maxOf(desired, policyReplicas)
                }
            }
            ScalingType.MEMORY -> {
                if (policy.value > 0 && metrics.memoryUtilization > policy.value) {
                    val policyReplicas = ceil(currentReplicas * metrics.memoryUtilization / policy.value).toInt()
                    desired = maxOf(desired, policyReplicas)
                }
            }
            ScalingType.QPS -> {
                if (policy.value > 0 && metrics.qps > policy.value) {
                    val policyReplicas = ceil(metrics.qps / policy.value).toInt()
                    desired = maxOf(desired, policyReplicas)
                }
            }
            ScalingType.SCHEDULE -> {
                // Schedule-based scaling
                if (isInScheduleTime()) {
                    desired = maxOf(desired, policy.value)
                }
            }
        }
    }
    return desired
}
/**
 * Fetches the current CPU utilization, memory utilization and QPS of a deployment.
 *
 * The three values are requested from `metricsClient` one after another, in that order.
 *
 * @param config scaling configuration identifying the deployment by namespace and name.
 * @return a metrics snapshot for the deployment.
 */
private suspend fun getCurrentMetrics(config: ScalingConfig): DeploymentMetrics =
    DeploymentMetrics(
        cpuUtilization = metricsClient.getCPUUtilization(config.namespace, config.deploymentName),
        memoryUtilization = metricsClient.getMemoryUtilization(config.namespace, config.deploymentName),
        qps = metricsClient.getQPS(config.namespace, config.deploymentName)
    )
/**
 * Tells whether enough time has passed since the last recorded scaling of the deployment.
 *
 * Scaling up waits for `config.scalingCooldown`, scaling down for twice that long. The cooldown
 * is multiplied by 1000 before being compared with a `System.currentTimeMillis()` difference, so
 * it is treated as seconds; `timestamp` is presumably epoch milliseconds — TODO confirm.
 *
 * @param config scaling configuration of the deployment.
 * @param currentReplicas replica count the deployment currently has.
 * @param desiredReplicas replica count the deployment should be scaled to.
 * @return `true` when there is no previous scaling or the cooldown has elapsed.
 */
private fun isCooledDown(
    config: ScalingConfig,
    currentReplicas: Int,
    desiredReplicas: Int
): Boolean {
    val previous = scalingHistory.getLastScaling(config.namespace, config.deploymentName)
        ?: return true

    val elapsedMillis = System.currentTimeMillis() - previous.timestamp
    val scalingUp = desiredReplicas > currentReplicas
    // Scale-up uses the short cooldown, scale-down the long one.
    val requiredMillis = when {
        scalingUp -> config.scalingCooldown * 1000
        else -> config.scalingCooldown * 2 * 1000
    }
    return elapsedMillis > requiredMillis
}
/**
 * Tells whether the recent scaling history of the deployment looks stable.
 *
 * Looks at the scalings returned for `config.stabilizationWindow` and computes the population
 * standard deviation of their replica deltas (`toReplicas - fromReplicas`). The history counts as
 * stable when there are fewer than two entries or the deviation is below 0.5.
 *
 * @param config scaling configuration of the deployment.
 * @param currentReplicas not used by this check.
 * @param desiredReplicas not used by this check.
 * @return `true` when scaling may proceed.
 */
private fun isStable(
    config: ScalingConfig,
    currentReplicas: Int,
    desiredReplicas: Int
): Boolean {
    val history = scalingHistory.getRecentScalings(
        config.namespace,
        config.deploymentName,
        config.stabilizationWindow
    )
    if (history.size < 2) return true

    // Spread of the recent replica changes: a large spread means the deployment is flapping.
    val deltas = history.map { it.toReplicas - it.fromReplicas }
    val meanDelta = deltas.average()
    val variance = deltas.map { (it - meanDelta) * (it - meanDelta) }.average()
    return sqrt(variance) < 0.5
}
/**
 * Scales the deployment to [desiredReplicas] through the Kubernetes client.
 *
 * The current replica count is read again here and only used for the result. Any exception raised
 * while scaling is logged and returned as an error result instead of being thrown.
 *
 * @param config scaling configuration identifying the deployment.
 * @param desiredReplicas replica count to scale to.
 * @return a success result describing the change, or an error result carrying the failure message.
 */
private suspend fun scaleDeployment(
    config: ScalingConfig,
    desiredReplicas: Int
): ScalingResult {
    val currentReplicas = getCurrentReplicas(config.namespace, config.deploymentName)
    try {
        val deploymentHandle = k8sClient.apps().deployments()
            .inNamespace(config.namespace)
            .withName(config.deploymentName)
        deploymentHandle.scale(desiredReplicas)
        return ScalingResult.success(
            config = config,
            fromReplicas = currentReplicas,
            toReplicas = desiredReplicas,
            reason = "自动伸缩: 当前副本=$currentReplicas, 期望副本=$desiredReplicas"
        )
    } catch (e: Exception) {
        log.error("伸缩部署失败", e)
        return ScalingResult.error(
            config = config,
            error = "伸缩失败: ${e.message}"
        )
    }
}
}
- 故障恢复与自愈
6.1 自愈控制器
kotlin
class SelfHealingController(
private val k8sClient: KubernetesClient,
private val metricsClient: MetricsClient,
private val alertManager: AlertManager
) {
/**
 * Definition of one self-healing rule: when [condition] is met, [action] is the remedy.
 *
 * @property name unique rule name; rules are looked up by this name.
 * @property condition condition that triggers the rule.
 * @property action remediation to perform.
 * @property cooldownSeconds cooldown between executions, in seconds.
 * @property maxRetries maximum number of retries.
 */
data class HealingRule(
    val name: String,
    val condition: HealingCondition,
    val action: HealingAction,
    val cooldownSeconds: Long = 300,
    val maxRetries: Int = 3,
)
/** Closed set of conditions that can trigger a [HealingRule]. */
sealed class HealingCondition {

    /** A pod restarted at least [restartCountThreshold] times within [timeWindowSeconds]. */
    data class PodCrashing(
        val restartCountThreshold: Int = 3,
        val timeWindowSeconds: Long = 300,
    ) : HealingCondition()

    /** Usage of [resourceType] stayed above [threshold] for [durationSeconds]. */
    data class HighResourceUsage(
        val resourceType: ResourceType,
        val threshold: Double,
        val durationSeconds: Long = 300,
    ) : HealingCondition()

    /** A health check failed [consecutiveFailures] times in a row within [timeWindowSeconds]. */
    data class HealthCheckFailed(
        val consecutiveFailures: Int = 3,
        val timeWindowSeconds: Long = 60,
    ) : HealingCondition()

    /** The network error rate stayed above [errorRateThreshold] for [durationSeconds]. */
    data class NetworkIssue(
        val errorRateThreshold: Double = 0.1,
        val durationSeconds: Long = 60,
    ) : HealingCondition()

    /** The metric [metricName] compared to [threshold] with [operator] held for [durationSeconds]. */
    data class CustomMetric(
        val metricName: String,
        val threshold: Double,
        val operator: Operator,
        val durationSeconds: Long = 300,
    ) : HealingCondition()
}
/** Closed set of remediation actions a [HealingRule] can request. */
sealed class HealingAction {

    /**
     * Restart a pod.
     *
     * @property podName pod to restart; `null` restarts every unhealthy pod.
     * @property gracefulPeriodSeconds termination grace period in seconds.
     */
    data class RestartPod(
        val podName: String? = null,
        val gracefulPeriodSeconds: Long = 30,
    ) : HealingAction()

    /** Move the pod [podName] to another node, optionally constrained by [nodeSelector]. */
    data class ReschedulePod(
        val podName: String,
        val nodeSelector: Map<String, String> = emptyMap(),
    ) : HealingAction()

    /** Add [additionalReplicas] replicas. */
    data class ScaleOut(
        val additionalReplicas: Int = 1,
    ) : HealingAction()

    /** Write [configData] into the config map [configMapName]. */
    data class UpdateConfig(
        val configMapName: String,
        val configData: Map<String, String>,
    ) : HealingAction()

    /**
     * Roll a deployment back.
     *
     * @property revision revision to roll back to; `null` means the previous revision.
     */
    data class RollbackDeployment(
        val revision: Int? = null,
    ) : HealingAction()

    /** Run [script] with [args]. */
    data class ExecuteScript(
        val script: String,
        val args: List<String> = emptyList(),
    ) : HealingAction()

    /** Send [message] to the given notification [channels]. */
    data class SendNotification(
        val channels: List<NotificationChannel>,
        val message: String,
    ) : HealingAction()
}
/** Built-in self-healing rules; each one is looked up by its `name`. */
private val predefinedRules: List<HealingRule> = listOf(
    // Pod restarts too often -> restart unhealthy pods
    HealingRule(
        name = "pod-crashing",
        condition = HealingCondition.PodCrashing(
            restartCountThreshold = 3,
            timeWindowSeconds = 300
        ),
        action = HealingAction.RestartPod(),
        cooldownSeconds = 300
    ),
    // CPU usage too high -> add one replica
    HealingRule(
        name = "high-cpu-usage",
        condition = HealingCondition.HighResourceUsage(
            resourceType = ResourceType.CPU,
            threshold = 90.0,
            durationSeconds = 300
        ),
        action = HealingAction.ScaleOut(additionalReplicas = 1),
        cooldownSeconds = 300
    ),
    // Memory usage too high -> add one replica
    HealingRule(
        name = "high-memory-usage",
        condition = HealingCondition.HighResourceUsage(
            resourceType = ResourceType.MEMORY,
            threshold = 90.0,
            durationSeconds = 300
        ),
        action = HealingAction.ScaleOut(additionalReplicas = 1),
        cooldownSeconds = 300
    ),
    // Health check failed -> restart unhealthy pods
    HealingRule(
        name = "health-check-failed",
        condition = HealingCondition.HealthCheckFailed(
            consecutiveFailures = 3,
            timeWindowSeconds = 60
        ),
        action = HealingAction.RestartPod(),
        cooldownSeconds = 300
    ),
    // Network error rate too high -> run the network repair script
    HealingRule(
        name = "network-error-rate",
        condition = HealingCondition.NetworkIssue(
            errorRateThreshold = 0.1,
            durationSeconds = 60
        ),
        action = HealingAction.ExecuteScript(
            script = "fix_network.sh",
            args = listOf("--restart-service")
        ),
        cooldownSeconds = 60
    )
)
/**
 * Runs the self-healing control loop until the calling coroutine is cancelled.
 *
 * Each iteration collects the system status, evaluates the healing rules, executes the resulting
 * actions and then waits `checkIntervalMillis`. If an iteration fails, the error is logged and
 * the loop resumes after `errorRetryDelayMillis`.
 *
 * NOTE(review): `checkIntervalMillis` and `errorRetryDelayMillis` are not declared in the part of
 * the class shown here — confirm where they are defined.
 */
suspend fun run() {
    while (true) {
        try {
            // 1. Collect the system status
            val systemStatus = collectSystemStatus()
            // 2. Evaluate the healing rules
            val healingActions = checkHealingRules(systemStatus)
            // 3. Execute the remediation actions
            healingActions.forEach { action ->
                executeHealingAction(action)
            }
            // 4. Wait for the next check
            delay(checkIntervalMillis)
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation is the only way out of this loop: let it propagate instead of logging
            // it as a controller error and entering the retry delay.
            throw e
        } catch (e: Exception) {
            log.error("自愈控制器运行异常", e)
            delay(errorRetryDelayMillis)
        }
    }
}
/**
 * Takes a snapshot of the cluster: deployments and pods of every namespace, all nodes, and the
 * currently active alerts.
 *
 * Active alerts are fetched first, then deployments and pods namespace by namespace, then nodes.
 * Missing status fields fall back to `0` or an empty list.
 *
 * NOTE(review): `DeploymentStatus` is constructed here as a record with name/replicas/conditions,
 * while other parts of the file use the same name for a status value — confirm which type is meant.
 *
 * @return the collected status, stamped with the current wall-clock time.
 */
private suspend fun collectSystemStatus(): SystemStatus {
    val namespaces = k8sClient.namespaces().list().items.map { it.metadata.name }
    val deployments = mutableListOf<DeploymentStatus>()
    val pods = mutableListOf<PodStatus>()
    val nodes = mutableListOf<NodeStatus>()
    val alerts = alertManager.getActiveAlerts()

    for (namespace in namespaces) {
        // Deployment status of this namespace
        k8sClient.apps().deployments()
            .inNamespace(namespace)
            .list().items
            .mapTo(deployments) { deployment ->
                DeploymentStatus(
                    name = deployment.metadata.name,
                    namespace = namespace,
                    replicas = deployment.spec.replicas,
                    readyReplicas = deployment.status?.readyReplicas ?: 0,
                    availableReplicas = deployment.status?.availableReplicas ?: 0,
                    conditions = deployment.status?.conditions?.map { condition ->
                        DeploymentCondition(
                            type = condition.type,
                            status = condition.status,
                            lastUpdateTime = condition.lastUpdateTime?.time ?: 0
                        )
                    } ?: emptyList()
                )
            }

        // Pod status of this namespace
        k8sClient.pods()
            .inNamespace(namespace)
            .list().items
            .mapTo(pods) { pod ->
                PodStatus(
                    name = pod.metadata.name,
                    namespace = namespace,
                    nodeName = pod.spec.nodeName,
                    phase = pod.status.phase,
                    // Restarts summed over all containers of the pod.
                    restartCount = pod.status.containerStatuses?.sumOf { it.restartCount } ?: 0,
                    conditions = pod.status.conditions?.map { condition ->
                        PodCondition(
                            type = condition.type,
                            status = condition.status,
                            message = condition.message
                        )
                    } ?: emptyList(),
                    resourceUsage = getPodResourceUsage(namespace, pod.metadata.name)
                )
            }
    }

    // Node status
    k8sClient.nodes().list().items.mapTo(nodes) { node ->
        NodeStatus(
            name = node.metadata.name,
            conditions = node.status.conditions?.map { condition ->
                NodeCondition(
                    type = condition.type,
                    status = condition.status,
                    message = condition.message
                )
            } ?: emptyList(),
            capacity = node.status.capacity?.let { resources ->
                NodeCapacity(
                    cpu = resources["cpu"]?.amount?.toDouble() ?: 0.0,
                    memory = resources["memory"]?.amount?.toDouble() ?: 0.0,
                    pods = resources["pods"]?.amount?.toDouble() ?: 0.0
                )
            } ?: NodeCapacity(),
            allocatable = node.status.allocatable?.let { resources ->
                NodeCapacity(
                    cpu = resources["cpu"]?.amount?.toDouble() ?: 0.0,
                    memory = resources["memory"]?.amount?.toDouble() ?: 0.0,
                    pods = resources["pods"]?.amount?.toDouble() ?: 0.0
                )
            } ?: NodeCapacity()
        )
    }

    return SystemStatus(
        deployments = deployments,
        pods = pods,
        nodes = nodes,
        alerts = alerts,
        timestamp = System.currentTimeMillis()
    )
}
/**
 * Evaluates the predefined healing rules against a status snapshot.
 *
 * Thresholds are read from the rule conditions (falling back to 90.0 for resource usage and 0.1
 * for the network error rate when a condition has an unexpected type), so editing a rule changes
 * what is checked. Each rule contributes its action at most once per pass, and equal actions from
 * different rules are merged. The actions carry no target and are executed against the whole
 * namespace, so queuing one per matching pod or deployment would repeat the same cluster-wide
 * operation.
 *
 * @param status snapshot to evaluate.
 * @return the distinct actions to execute, in rule order (pod restarts, CPU, memory, health
 * check, network).
 */
private fun checkHealingRules(status: SystemStatus): List<HealingAction> {
    // Resolve every rule once instead of once per pod / deployment.
    val rulesByName = predefinedRules.associateBy { it.name }
    val actions = mutableListOf<HealingAction>()

    // Pod restart count
    rulesByName["pod-crashing"]?.let { rule ->
        val condition = rule.condition as? HealingCondition.PodCrashing
        if (condition != null &&
            status.pods.any { it.restartCount > 0 && it.restartCount >= condition.restartCountThreshold }
        ) {
            actions.add(rule.action)
        }
    }

    // CPU usage
    rulesByName["high-cpu-usage"]?.let { rule ->
        val threshold = (rule.condition as? HealingCondition.HighResourceUsage)?.threshold ?: 90.0
        if (status.deployments.any { getDeploymentCPUUsage(it.namespace, it.name) > threshold }) {
            actions.add(rule.action)
        }
    }

    // Memory usage
    rulesByName["high-memory-usage"]?.let { rule ->
        val threshold = (rule.condition as? HealingCondition.HighResourceUsage)?.threshold ?: 90.0
        if (status.deployments.any { getDeploymentMemoryUsage(it.namespace, it.name) > threshold }) {
            actions.add(rule.action)
        }
    }

    // Health check: a deployment whose "Available" condition is "False"
    rulesByName["health-check-failed"]?.let { rule ->
        val anyUnavailable = status.deployments.any { deployment ->
            deployment.conditions.any { it.type == "Available" && it.status == "False" }
        }
        if (anyUnavailable) {
            actions.add(rule.action)
        }
    }

    // Network error rate
    rulesByName["network-error-rate"]?.let { rule ->
        val threshold = (rule.condition as? HealingCondition.NetworkIssue)?.errorRateThreshold ?: 0.1
        if (getNetworkErrorRate() > threshold) {
            actions.add(rule.action)
        }
    }

    // Different rules can request the same action (e.g. RestartPod()); run it once per pass.
    return actions.distinct()
}
/**
 * Dispatches a healing action to the handler for its concrete type.
 *
 * @param action the action to execute.
 */
private suspend fun executeHealingAction(action: HealingAction) {
    when (action) {
        is HealingAction.RestartPod         -> restartPod(action)
        is HealingAction.ReschedulePod      -> reschedulePod(action)
        is HealingAction.ScaleOut           -> scaleOut(action)
        is HealingAction.UpdateConfig       -> updateConfig(action)
        is HealingAction.RollbackDeployment -> rollbackDeployment(action)
        is HealingAction.ExecuteScript      -> executeScript(action)
        is HealingAction.SendNotification   -> sendNotification(action)
    }
}
/**
 * Restarts pods by deleting them; this relies on the owning controller to recreate them.
 *
 * With `action.podName` set, that pod is deleted in the `default` namespace. Otherwise every pod
 * in `default` that is not in phase `Running`, or has a container that is not ready, is deleted.
 * `action.gracefulPeriodSeconds` is applied to every delete. Failures are logged per pod, so one
 * failed delete does not stop the remaining restarts.
 *
 * @param action restart request.
 */
private suspend fun restartPod(action: HealingAction.RestartPod) {
    // Deletes a single pod with the configured grace period; never throws.
    fun deletePod(namespace: String, name: String, successMessage: String) {
        try {
            k8sClient.pods()
                .inNamespace(namespace)
                .withName(name)
                .withGracePeriod(action.gracefulPeriodSeconds)
                .delete()
            log.info(successMessage)
        } catch (e: Exception) {
            log.error("重启Pod失败", e)
        }
    }

    val targetPod = action.podName
    if (targetPod != null) {
        // Restart the requested pod
        deletePod("default", targetPod, "已重启Pod: ${action.podName}")
        return
    }

    // Restart every unhealthy pod
    val unhealthyPods = try {
        k8sClient.pods()
            .inNamespace("default")
            .list().items
            .filter { pod ->
                pod.status.phase != "Running" ||
                    pod.status.containerStatuses?.any { !it.ready } == true
            }
    } catch (e: Exception) {
        log.error("重启Pod失败", e)
        return
    }
    unhealthyPods.forEach { pod ->
        deletePod(
            namespace = pod.metadata.namespace,
            name = pod.metadata.name,
            successMessage = "已重启不健康Pod: ${pod.metadata.name}"
        )
    }
}
/**
 * Adds `action.additionalReplicas` replicas to every deployment in the `default` namespace.
 *
 * This is a simplified implementation: a real one should scale only the affected deployment.
 * A deployment without an explicit `spec.replicas` is treated as having 1 replica (the
 * Kubernetes default). Failures are logged per deployment so the others are still scaled.
 *
 * @param action scale-out request.
 */
private suspend fun scaleOut(action: HealingAction.ScaleOut) {
    val deployments = try {
        k8sClient.apps().deployments()
            .inNamespace("default")
            .list().items
    } catch (e: Exception) {
        log.error("扩缩容失败", e)
        return
    }

    deployments.forEach { deployment ->
        try {
            val currentReplicas = deployment.spec.replicas ?: 1
            val newReplicas = currentReplicas + action.additionalReplicas
            k8sClient.apps().deployments()
                .inNamespace(deployment.metadata.namespace)
                .withName(deployment.metadata.name)
                .scale(newReplicas)
            // Braces are required: CJK characters are valid identifier characters, so
            // "$currentReplicas到" would be parsed as a reference to `currentReplicas到`.
            log.info("已扩展部署: ${deployment.metadata.name}, 从${currentReplicas}到${newReplicas}个副本")
        } catch (e: Exception) {
            log.error("扩缩容失败", e)
        }
    }
}
/**
 * Rolls back every deployment in the `default` namespace.
 *
 * `action.revision` selects the revision to return to; `null` is passed through to the client
 * and logged as "previous revision". Any exception stops the remaining rollbacks and is logged.
 *
 * NOTE(review): the `rollback().toRevision(..).rollback()` chain is kept as written — confirm it
 * matches the Kubernetes client version in use.
 *
 * @param action rollback request.
 */
private suspend fun rollbackDeployment(action: HealingAction.RollbackDeployment) {
    try {
        val candidates = k8sClient.apps().deployments()
            .inNamespace("default")
            .list().items
        for (deployment in candidates) {
            val deploymentHandle = k8sClient.apps().deployments()
                .inNamespace(deployment.metadata.namespace)
                .withName(deployment.metadata.name)
            deploymentHandle
                .rollback()
                .toRevision(action.revision?.toLong())
                .rollback()
            log.info("已回滚部署: ${deployment.metadata.name} 到版本${action.revision ?: "上一个"}")
        }
    } catch (e: Exception) {
        log.error("回滚部署失败", e)
    }
}
}
- 成本优化与资源管理
7.1 成本优化器
kotlin
class CostOptimizer(
private val k8sClient: KubernetesClient,
private val cloudProvider: CloudProvider,
private val metricsClient: MetricsClient
) {
/**
 * Analyzes resource usage across every namespace of the cluster.
 *
 * Namespaces are analyzed one after another; cost, waste and savings are the sums of the
 * per-namespace values.
 *
 * @return the per-namespace analyses, the cluster-wide totals and the derived recommendations.
 */
suspend fun analyzeResourceUsage(): ResourceAnalysis {
    val perNamespace = k8sClient.namespaces().list().items
        .map { it.metadata.name }
        .map { namespace -> analyzeNamespaceResources(namespace) }

    return ResourceAnalysis(
        namespaceAnalyses = perNamespace,
        totalCost = perNamespace.sumOf { it.estimatedCost },
        totalWaste = perNamespace.sumOf { it.wastedCost },
        totalSavings = perNamespace.sumOf { it.potentialSavings },
        recommendations = generateRecommendations(perNamespace)
    )
}
/**
 * Aggregates requested and used CPU / memory over all deployments of a namespace.
 *
 * Utilization is `used / requested * 100`, or `0.0` when nothing is requested. Cost figures are
 * computed from the namespace totals.
 *
 * @param namespace namespace to analyze.
 * @return the per-deployment analyses plus the namespace totals, utilization and cost figures.
 */
private suspend fun analyzeNamespaceResources(namespace: String): NamespaceAnalysis {
    // Percentage of the requested amount that is actually used; 0 when nothing is requested.
    fun utilizationPercent(used: Double, requested: Double): Double =
        if (requested > 0) used / requested * 100 else 0.0

    val perDeployment = k8sClient.apps().deployments()
        .inNamespace(namespace)
        .list().items
        .map { deployment -> analyzeDeploymentResources(namespace, deployment.metadata.name) }

    val requestedCpu = perDeployment.sumOf { it.requestedCPU }
    val requestedMem = perDeployment.sumOf { it.requestedMemory }
    val usedCpu = perDeployment.sumOf { it.usedCPU }
    val usedMem = perDeployment.sumOf { it.usedMemory }

    val cost = calculateCost(requestedCpu, requestedMem)
    val waste = calculateWastedCost(requestedCpu, usedCpu, requestedMem, usedMem)
    val savings = calculatePotentialSavings(requestedCpu, usedCpu, requestedMem, usedMem)

    return NamespaceAnalysis(
        namespace = namespace,
        deployments = perDeployment,
        totalRequestedCPU = requestedCpu,
        totalRequestedMemory = requestedMem,
        totalUsedCPU = usedCpu,
        totalUsedMemory = usedMem,
        cpuUtilization = utilizationPercent(usedCpu, requestedCpu),
        memoryUtilization = utilizationPercent(usedMem, requestedMem),
        estimatedCost = cost,
        wastedCost = waste,
        potentialSavings = savings
    )
}
/**
 * Aggregates requested and used CPU / memory over the pods of one deployment.
 *
 * The pods are selected with the deployment's `matchLabels` selector. Utilization is
 * `used / requested * 100`, or `0.0` when nothing is requested.
 *
 * @param namespace namespace of the deployment.
 * @param deploymentName name of the deployment.
 * @return the per-pod analyses, the deployment totals and the optimization recommendations.
 * @throws IllegalArgumentException when the deployment does not exist.
 */
private suspend fun analyzeDeploymentResources(
    namespace: String,
    deploymentName: String
): DeploymentAnalysis {
    // Percentage of the requested amount that is actually used; 0 when nothing is requested.
    fun utilizationPercent(used: Double, requested: Double): Double =
        if (requested > 0) used / requested * 100 else 0.0

    val deployment = k8sClient.apps().deployments()
        .inNamespace(namespace)
        .withName(deploymentName)
        .get()
        ?: throw IllegalArgumentException("Deployment not found: $namespace/$deploymentName")

    val perPod = k8sClient.pods()
        .inNamespace(namespace)
        .withLabels(deployment.spec.selector.matchLabels)
        .list().items
        .map { pod -> analyzePodResources(namespace, pod.metadata.name) }

    val requestedCpu = perPod.sumOf { it.requestedCPU }
    val requestedMem = perPod.sumOf { it.requestedMemory }
    val usedCpu = perPod.sumOf { it.usedCPU }
    val usedMem = perPod.sumOf { it.usedMemory }

    val advice = generateDeploymentRecommendations(
        deploymentName = deploymentName,
        requestedCPU = requestedCpu,
        requestedMemory = requestedMem,
        usedCPU = usedCpu,
        usedMemory = usedMem,
        replicas = deployment.spec.replicas
    )

    return DeploymentAnalysis(
        name = deploymentName,
        namespace = namespace,
        pods = perPod,
        replicas = deployment.spec.replicas,
        requestedCPU = requestedCpu,
        requestedMemory = requestedMem,
        usedCPU = usedCpu,
        usedMemory = usedMem,
        cpuUtilization = utilizationPercent(usedCpu, requestedCpu),
        memoryUtilization = utilizationPercent(usedMem, requestedMem),
        recommendations = advice
    )
}
// 生成优化建议
private fun generateDeploymentRecommendations(
deploymentName: String,
requestedCPU: Double,
requestedMemory: Double,
usedCPU: Double,
usedMemory: Double,
replicas: Int
): List<OptimizationRecommendation> {
val recommendations = mutableListOf<OptimizationRecommendation>()
// CPU建议
if (requestedCPU > 0 && usedCPU / requestedCPU < 0.5) {
val suggestedCPU = max(usedCPU * 1.5, requestedCPU * 0.7)
recommendations.add(
OptimizationRecommendation(
type = RecommendationType.RESIZE,
resource = ResourceType.CPU,
currentValue = requestedCPU,
suggestedValue = suggestedCPU,
reason = "CPU使用率较低 (${(usedCPU / requestedCPU * 100).toInt()}%)",
estimatedSavings = (requestedCPU - suggestedCPU) * getCPUPrice() * 24 * 30
)
)
} else if (usedCPU / requestedCPU > 0.8) {
recommendations.add(
OptimizationRecommendation(
class DeploymentDashboard(
private val context: Context,
private val deployService: DeployService,
private val monitorService: MonitorService,
private val costOptimizer: CostOptimizer
) {
/**
 * Builds the dashboard: a scrollable vertical list of the title, overview, deployment status,
 * resource usage, cost analysis and optimization sections, in that order.
 *
 * @return the root view of the dashboard.
 */
fun createDashboard(): View {
    val scrollView = ScrollView(context)
    val mainLayout = LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
        setPadding(16, 16, 16, 16)
    }

    // Section builders in display order; each section is built right before it is attached.
    val sectionBuilders: List<() -> View> = listOf(
        ::createTitleView,            // title
        ::createOverviewCard,         // overview card
        ::createDeploymentStatusView, // deployment status
        ::createResourceUsageView,    // resource usage
        ::createCostAnalysisView,     // cost analysis
        ::createOptimizationView      // optimization recommendations
    )
    sectionBuilders.forEach { buildSection -> mainLayout.addView(buildSection()) }

    scrollView.addView(mainLayout)
    return scrollView
}
/** Creates the centered, bold dashboard title. */
private fun createTitleView(): View {
    val title = TextView(context)
    title.text = "模型服务部署平台"
    title.textSize = 24f
    title.setTypeface(null, Typeface.BOLD)
    title.setTextColor(Color.BLACK)
    title.gravity = Gravity.CENTER
    title.setPadding(0, 0, 0, 16)
    return title
}
/**
 * Creates the overview card and starts loading its data.
 *
 * The card is returned immediately with an empty content layout; the overview is fetched on
 * `Dispatchers.IO` and rendered on the main dispatcher once available. Load failures are only
 * logged.
 *
 * NOTE(review): the load runs in `GlobalScope`, so it is not cancelled when the view goes away —
 * consider a lifecycle-bound scope.
 */
private fun createOverviewCard(): View {
    val cardParams = LinearLayout.LayoutParams(
        LinearLayout.LayoutParams.MATCH_PARENT,
        LinearLayout.LayoutParams.WRAP_CONTENT
    )
    cardParams.setMargins(0, 0, 0, 16)

    val card = CardView(context)
    card.layoutParams = cardParams
    card.cardElevation = 8f
    card.radius = 8f

    val layout = LinearLayout(context)
    layout.orientation = LinearLayout.VERTICAL
    layout.setPadding(16, 16, 16, 16)

    // Load the overview data asynchronously
    GlobalScope.launch(Dispatchers.Main) {
        try {
            val overview = withContext(Dispatchers.IO) { getDeploymentOverview() }
            updateOverviewCard(layout, overview)
        } catch (e: Exception) {
            Log.e("DeploymentDashboard", "加载概览数据失败", e)
        }
    }

    card.addView(layout)
    return card
}
/**
 * Collects the figures shown on the overview card.
 *
 * Fetches the deployment list, the cluster metrics and a full resource analysis, one after
 * another. `totalPods` is the sum of the deployments' `replicas`.
 *
 * @return the aggregated overview.
 */
private suspend fun getDeploymentOverview(): DeploymentOverview {
    val deployments = deployService.getDeployments()
    val clusterMetrics = monitorService.getClusterMetrics()
    val analysis = costOptimizer.analyzeResourceUsage()

    val running = deployments.count { it.status == DeploymentStatus.RUNNING }
    val failed = deployments.count { it.status == DeploymentStatus.FAILED }
    val podCount = deployments.sumOf { it.replicas }

    return DeploymentOverview(
        totalDeployments = deployments.size,
        runningDeployments = running,
        failedDeployments = failed,
        totalPods = podCount,
        cpuUsage = clusterMetrics.cpuUsage,
        memoryUsage = clusterMetrics.memoryUsage,
        monthlyCost = analysis.totalCost,
        monthlySavings = analysis.totalSavings
    )
}
/**
 * Replaces the content of the overview card with a title and a two-column label/value grid.
 *
 * @param layout content layout of the overview card; all of its children are removed first.
 * @param overview figures to display.
 */
private fun updateOverviewCard(layout: LinearLayout, overview: DeploymentOverview) {
    // One grid cell; label and value cells differ only in the bold typeface.
    fun gridCell(content: String, bold: Boolean): TextView = TextView(context).apply {
        text = content
        textSize = 14f
        if (bold) setTypeface(null, Typeface.BOLD)
        setPadding(8, 4, 8, 4)
    }

    layout.removeAllViews()
    layout.addView(
        TextView(context).apply {
            text = "系统概览"
            textSize = 18f
            setTypeface(null, Typeface.BOLD)
            setPadding(0, 0, 0, 8)
        }
    )

    val grid = GridLayout(context).apply { columnCount = 2 }
    val rows = listOf(
        "总部署数" to overview.totalDeployments.toString(),
        "运行中" to overview.runningDeployments.toString(),
        "失败" to overview.failedDeployments.toString(),
        "总Pod数" to overview.totalPods.toString(),
        "CPU使用率" to "${overview.cpuUsage.toInt()}%",
        "内存使用率" to "${overview.memoryUsage.toInt()}%",
        "月度成本" to "$${overview.monthlyCost.toInt()}",
        "潜在节省" to "$${overview.monthlySavings.toInt()}"
    )
    for ((label, value) in rows) {
        grid.addView(gridCell(label, bold = false)) // label
        grid.addView(gridCell(value, bold = true))  // value
    }
    layout.addView(grid)
}
/**
 * Creates the "deployment status" card and starts loading the deployment list.
 *
 * The card contains a title and a separate list container; only the container is refreshed when
 * the data arrives, so the title stays in place. Load failures are only logged.
 *
 * NOTE(review): the load runs in `GlobalScope`, so it is not cancelled when the view goes away —
 * consider a lifecycle-bound scope.
 */
private fun createDeploymentStatusView(): View {
    val cardParams = LinearLayout.LayoutParams(
        LinearLayout.LayoutParams.MATCH_PARENT,
        LinearLayout.LayoutParams.WRAP_CONTENT
    )
    cardParams.setMargins(0, 0, 0, 16)

    val card = CardView(context)
    card.layoutParams = cardParams
    card.cardElevation = 8f
    card.radius = 8f

    val layout = LinearLayout(context)
    layout.orientation = LinearLayout.VERTICAL
    layout.setPadding(16, 16, 16, 16)

    val title = TextView(context)
    title.text = "部署状态"
    title.textSize = 18f
    title.setTypeface(null, Typeface.BOLD)
    title.setPadding(0, 0, 0, 8)
    layout.addView(title)

    // Deployment list container
    val deploymentsView = LinearLayout(context)
    deploymentsView.orientation = LinearLayout.VERTICAL
    layout.addView(deploymentsView)

    // Load the deployments asynchronously
    GlobalScope.launch(Dispatchers.Main) {
        try {
            val deployments = withContext(Dispatchers.IO) { deployService.getDeployments() }
            updateDeploymentsView(deploymentsView, deployments)
        } catch (e: Exception) {
            Log.e("DeploymentDashboard", "加载部署数据失败", e)
        }
    }

    card.addView(layout)
    return card
}
/**
 * Replaces the children of [layout] with one item view per deployment, in list order.
 *
 * @param layout container to refill; all of its children are removed first.
 * @param deployments deployments to display.
 */
private fun updateDeploymentsView(layout: LinearLayout, deployments: List<Deployment>) {
    layout.removeAllViews()
    for (deployment in deployments) {
        layout.addView(createDeploymentItemView(deployment))
    }
}
/**
 * Creates the list row for one deployment.
 *
 * The row consists of a colored status bar, an info column (name, status, version / pods /
 * uptime) and action buttons. "Restart" and "Scale" are only offered for running deployments;
 * "Logs" is always available.
 *
 * @param deployment deployment to render.
 * @return the row view.
 */
private fun createDeploymentItemView(deployment: Deployment): View {
    val row = LinearLayout(context).apply {
        orientation = LinearLayout.HORIZONTAL
        setPadding(0, 8, 0, 8)
        background = createBackgroundDrawable()
    }

    // Status indicator
    row.addView(
        View(context).apply {
            layoutParams = LinearLayout.LayoutParams(8, LinearLayout.LayoutParams.MATCH_PARENT)
            background = createStatusColor(deployment.status)
        }
    )

    // Name and status
    val header = LinearLayout(context).apply {
        orientation = LinearLayout.HORIZONTAL
    }
    header.addView(
        TextView(context).apply {
            text = deployment.name
            textSize = 16f
            setTypeface(null, Typeface.BOLD)
            layoutParams = LinearLayout.LayoutParams(
                0,
                LinearLayout.LayoutParams.WRAP_CONTENT,
                1f
            )
        }
    )
    header.addView(
        TextView(context).apply {
            text = deployment.status.name
            textSize = 14f
            setTextColor(getStatusTextColor(deployment.status))
        }
    )

    // Details line
    val details = TextView(context).apply {
        text = "版本: ${deployment.version} | Pods: ${deployment.readyReplicas}/${deployment.replicas} | 运行时间: ${formatDuration(deployment.uptime)}"
        textSize = 12f
        setTextColor(Color.GRAY)
    }

    // Deployment info column; takes the remaining width of the row
    val info = LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
        layoutParams = LinearLayout.LayoutParams(
            0,
            LinearLayout.LayoutParams.WRAP_CONTENT,
            1f
        ).apply {
            setMargins(8, 0, 0, 0)
        }
    }
    info.addView(header)
    info.addView(details)
    row.addView(info)

    // Action buttons
    fun actionButton(label: String, onClick: () -> Unit): Button = Button(context).apply {
        text = label
        setOnClickListener { onClick() }
    }

    val actions = LinearLayout(context).apply {
        orientation = LinearLayout.HORIZONTAL
    }
    if (deployment.status == DeploymentStatus.RUNNING) {
        actions.addView(actionButton("重启") { restartDeployment(deployment) })
        actions.addView(actionButton("伸缩") { showScaleDialog(deployment) })
    }
    actions.addView(actionButton("日志") { showDeploymentLogs(deployment) })
    row.addView(actions)

    return row
}
/**
 * Creates the "resource usage" card with CPU, memory and storage progress bars.
 *
 * Simplified implementation: the percentages are fixed placeholder values (65 / 45 / 30), not
 * live metrics. A chart library such as MPAndroidChart could be integrated here instead.
 */
private fun createResourceUsageView(): View {
    val card = CardView(context).apply {
        layoutParams = LinearLayout.LayoutParams(
            LinearLayout.LayoutParams.MATCH_PARENT,
            LinearLayout.LayoutParams.WRAP_CONTENT
        ).apply {
            setMargins(0, 0, 0, 16)
        }
        cardElevation = 8f
        radius = 8f
    }
    val layout = LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
        setPadding(16, 16, 16, 16)
    }
    layout.addView(
        TextView(context).apply {
            text = "资源使用情况"
            textSize = 18f
            setTypeface(null, Typeface.BOLD)
            setPadding(0, 0, 0, 8)
        }
    )

    // One progress bar per resource, in display order.
    val placeholderUsage = listOf(
        "CPU使用率" to 65,
        "内存使用率" to 45,
        "存储使用率" to 30
    )
    for ((label, percent) in placeholderUsage) {
        layout.addView(createProgressBar(label, percent))
    }

    card.addView(layout)
    return card
}
/**
 * Creates a labelled horizontal progress bar: a header row with the label on the left and
 * "<progress>%" on the right, and an 8dp high bar below it.
 *
 * @param label text shown on the left of the header row.
 * @param progress value shown as percentage text and set as the bar's progress.
 * @return the composed view.
 */
private fun createProgressBar(label: String, progress: Int): View {
    val header = LinearLayout(context).apply {
        orientation = LinearLayout.HORIZONTAL
    }
    header.addView(
        TextView(context).apply {
            text = label
            textSize = 14f
            // Weight 1 pushes the percentage text to the right edge.
            layoutParams = LinearLayout.LayoutParams(
                0,
                LinearLayout.LayoutParams.WRAP_CONTENT,
                1f
            )
        }
    )
    header.addView(
        TextView(context).apply {
            text = "$progress%"
            textSize = 14f
        }
    )

    val bar = ProgressBar(context, null, android.R.attr.progressBarStyleHorizontal).apply {
        this.progress = progress
        layoutParams = LinearLayout.LayoutParams(
            LinearLayout.LayoutParams.MATCH_PARENT,
            dipToPx(8)
        )
    }

    return LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
        setPadding(0, 8, 0, 8)
        addView(header)
        addView(bar)
    }
}
// 辅助方法
/** Creates the item background: white fill, 4px corner radius, 1px light-gray border. */
private fun createBackgroundDrawable(): Drawable =
    GradientDrawable().apply {
        setColor(Color.WHITE)
        cornerRadius = 4f
        setStroke(1, Color.LTGRAY)
    }
/**
 * Maps a deployment status to the drawable used for the status bar.
 *
 * @param status deployment status.
 * @return green for running, red for failed, yellow for pending, gray for stopped and light gray
 * for any other status.
 */
private fun createStatusColor(status: DeploymentStatus): ColorDrawable {
    val color = when (status) {
        DeploymentStatus.RUNNING -> Color.GREEN
        DeploymentStatus.FAILED -> Color.RED
        DeploymentStatus.PENDING -> Color.YELLOW
        DeploymentStatus.STOPPED -> Color.GRAY
        else -> Color.LTGRAY
    }
    return ColorDrawable(color)
}
/**
 * Maps a deployment status to the color of the status text.
 *
 * @param status deployment status.
 * @return green for running, red for failed, yellow for pending, gray for stopped and dark gray
 * for any other status.
 */
private fun getStatusTextColor(status: DeploymentStatus): Int = when (status) {
    DeploymentStatus.RUNNING -> Color.GREEN
    DeploymentStatus.FAILED -> Color.RED
    DeploymentStatus.PENDING -> Color.YELLOW
    DeploymentStatus.STOPPED -> Color.GRAY
    else -> Color.DKGRAY
}
/**
 * Formats a duration as "<hours>h<minutes>m"; seconds and milliseconds are dropped.
 *
 * @param millis duration in milliseconds.
 * @return the formatted duration, e.g. "1h1m" for 3 661 000 ms.
 */
private fun formatDuration(millis: Long): String {
    val totalMinutes = millis / 1000 / 60
    val hours = totalMinutes / 60
    val minutes = totalMinutes % 60
    return "${hours}h${minutes}m"
}
/**
 * Converts density-independent pixels to physical pixels using the display density.
 *
 * @param dip value in dp.
 * @return the value in px, truncated towards zero.
 */
private fun dipToPx(dip: Int): Int =
    (dip * context.resources.displayMetrics.density).toInt()
/**
 * Restarts a deployment in the background and reports the outcome with a toast.
 *
 * The restart call runs on `Dispatchers.IO`; the toast is shown on the main dispatcher.
 *
 * NOTE(review): the coroutine runs in `GlobalScope` — consider a lifecycle-bound scope.
 *
 * @param deployment deployment to restart.
 */
private fun restartDeployment(deployment: Deployment) {
    // Shows a short toast on the main thread.
    suspend fun notifyUser(message: String) {
        withContext(Dispatchers.Main) {
            Toast.makeText(context, message, Toast.LENGTH_SHORT).show()
        }
    }

    GlobalScope.launch(Dispatchers.IO) {
        try {
            deployService.restartDeployment(deployment.namespace, deployment.name)
            notifyUser("重启部署成功")
        } catch (e: Exception) {
            notifyUser("重启失败: ${e.message}")
        }
    }
}
/**
 * Shows the dialog for changing the replica count of a deployment.
 *
 * NOTE(review): the confirm button currently only dismisses the dialog; the entered replica count
 * is not read or applied yet.
 *
 * @param deployment deployment to scale.
 */
private fun showScaleDialog(deployment: Deployment) {
    AlertDialog.Builder(context)
        .setTitle("伸缩部署")
        .setMessage("调整 ${deployment.name} 的副本数")
        .setView(createScaleDialogView(deployment))
        .setPositiveButton("确定") { dialogInterface, _ ->
            // Scaling logic goes here
            dialogInterface.dismiss()
        }
        .setNegativeButton("取消") { dialogInterface, _ ->
            dialogInterface.dismiss()
        }
        .create()
        .show()
}
/**
 * Creates the content of the scale dialog: the current replica count and a numeric input
 * pre-filled with that count.
 *
 * @param deployment deployment whose replica count is shown.
 * @return the dialog content view.
 */
private fun createScaleDialogView(deployment: Deployment): View {
    val root = LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
        setPadding(16, 16, 16, 16)
    }
    root.addView(
        TextView(context).apply {
            text = "当前副本数: ${deployment.replicas}"
            textSize = 16f
            setPadding(0, 0, 0, 16)
        }
    )

    val inputRow = LinearLayout(context).apply {
        orientation = LinearLayout.HORIZONTAL
    }
    inputRow.addView(
        TextView(context).apply {
            text = "新副本数:"
            textSize = 16f
            setPadding(0, 0, 16, 0)
        }
    )
    inputRow.addView(
        EditText(context).apply {
            setText(deployment.replicas.toString())
            inputType = InputType.TYPE_CLASS_NUMBER
            layoutParams = LinearLayout.LayoutParams(
                LinearLayout.LayoutParams.WRAP_CONTENT,
                LinearLayout.LayoutParams.WRAP_CONTENT
            )
        }
    )
    root.addView(inputRow)

    return root
}
/**
 * Opens `LogsActivity` for a deployment, passing its namespace and name as the extras
 * "namespace" and "deployment".
 *
 * @param deployment deployment whose logs should be shown.
 */
private fun showDeploymentLogs(deployment: Deployment) {
    val logsIntent = Intent(context, LogsActivity::class.java)
    logsIntent.putExtra("namespace", deployment.namespace)
    logsIntent.putExtra("deployment", deployment.name)
    context.startActivity(logsIntent)
}
/**
 * Creates the "cost analysis" card and starts loading the cost data.
 *
 * The card holds the title plus a dedicated content container. Only the container is handed to
 * [updateCostAnalysisView], which clears its target with `removeAllViews()` — passing the outer
 * layout would remove the title as soon as the data arrives. Load failures are only logged.
 *
 * NOTE(review): the load runs in `GlobalScope`, so it is not cancelled when the view goes away —
 * consider a lifecycle-bound scope.
 */
private fun createCostAnalysisView(): View {
    val card = CardView(context).apply {
        layoutParams = LinearLayout.LayoutParams(
            LinearLayout.LayoutParams.MATCH_PARENT,
            LinearLayout.LayoutParams.WRAP_CONTENT
        ).apply {
            setMargins(0, 0, 0, 16)
        }
        cardElevation = 8f
        radius = 8f
    }
    val layout = LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
        setPadding(16, 16, 16, 16)
    }
    val title = TextView(context).apply {
        text = "成本分析"
        textSize = 18f
        setTypeface(null, Typeface.BOLD)
        setPadding(0, 0, 0, 8)
    }
    layout.addView(title)

    // Container for the asynchronously loaded content; keeps the title out of reach of
    // removeAllViews() in updateCostAnalysisView().
    val contentLayout = LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
    }
    layout.addView(contentLayout)

    // Load the cost data asynchronously
    GlobalScope.launch(Dispatchers.Main) {
        try {
            val costAnalysis = withContext(Dispatchers.IO) {
                costOptimizer.analyzeResourceUsage()
            }
            updateCostAnalysisView(contentLayout, costAnalysis)
        } catch (e: Exception) {
            Log.e("DeploymentDashboard", "加载成本数据失败", e)
        }
    }

    card.addView(layout)
    return card
}
/**
 * Replaces the content of [layout] with the cost figures and a placeholder for the trend chart.
 *
 * Total cost is shown in black, waste in red and potential savings in green.
 *
 * @param layout container to refill; all of its children are removed first.
 * @param costAnalysis analysis providing the totals.
 */
private fun updateCostAnalysisView(layout: LinearLayout, costAnalysis: ResourceAnalysis) {
    layout.removeAllViews()

    // label, formatted value, value color
    val rows = listOf(
        Triple("月度总成本", "$${costAnalysis.totalCost.toInt()}", Color.BLACK),
        Triple("资源浪费", "$${costAnalysis.totalWaste.toInt()}", Color.RED),
        Triple("潜在节省", "$${costAnalysis.totalSavings.toInt()}", Color.GREEN)
    )
    for ((label, value, valueColor) in rows) {
        val row = LinearLayout(context).apply {
            orientation = LinearLayout.HORIZONTAL
            setPadding(0, 4, 0, 4)
        }
        row.addView(
            TextView(context).apply {
                text = label
                textSize = 14f
                layoutParams = LinearLayout.LayoutParams(
                    0,
                    LinearLayout.LayoutParams.WRAP_CONTENT,
                    1f
                )
            }
        )
        row.addView(
            TextView(context).apply {
                text = value
                textSize = 14f
                setTypeface(null, Typeface.BOLD)
                setTextColor(valueColor)
            }
        )
        layout.addView(row)
    }

    // Cost trend chart
    layout.addView(
        TextView(context).apply {
            text = "成本趋势"
            textSize = 16f
            setTypeface(null, Typeface.BOLD)
            setPadding(0, 16, 0, 8)
        }
    )
    // A chart library could be integrated here to render the cost trend.
    layout.addView(
        TextView(context).apply {
            text = "成本趋势图表将在这里显示"
            textSize = 14f
            setTextColor(Color.GRAY)
            gravity = Gravity.CENTER
            setPadding(0, 16, 0, 16)
        }
    )
}
/**
 * Builds the "optimization suggestions" card.
 *
 * The card is returned immediately with only its title; the recommendations are fetched
 * on [Dispatchers.IO] and handed to [updateOptimizationView] on the main dispatcher once
 * available. A failed load is logged and leaves the card as it is.
 *
 * NOTE(review): the load runs in GlobalScope, so it is not tied to any view or screen
 * lifecycle — consider a lifecycle-bound scope if the enclosing class has one.
 *
 * @return the card view, ready to be added to a parent layout.
 */
private fun createOptimizationView(): View {
    val content = LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
        setPadding(16, 16, 16, 16)
    }
    content.addView(
        TextView(context).apply {
            text = "优化建议"
            textSize = 18f
            setTypeface(null, Typeface.BOLD)
            setPadding(0, 0, 0, 8)
        }
    )

    // Fetch asynchronously, then render back on the main dispatcher.
    GlobalScope.launch(Dispatchers.Main) {
        try {
            val loaded = withContext(Dispatchers.IO) {
                costOptimizer.analyzeResourceUsage().recommendations
            }
            updateOptimizationView(content, loaded)
        } catch (e: Exception) {
            Log.e("DeploymentDashboard", "加载优化建议失败", e)
        }
    }

    val cardParams = LinearLayout.LayoutParams(
        LinearLayout.LayoutParams.MATCH_PARENT,
        LinearLayout.LayoutParams.WRAP_CONTENT
    )
    cardParams.setMargins(0, 0, 0, 16)

    val card = CardView(context)
    card.layoutParams = cardParams
    card.cardElevation = 8f
    card.radius = 8f
    card.addView(content)
    return card
}
/**
 * Renders [recommendations] into [layout].
 *
 * Shows an "empty" message when there is nothing to display; otherwise shows the first
 * few recommendations and, if more exist, a clickable line that opens the full list via
 * [showAllRecommendations].
 *
 * The first child of [layout] is preserved: the caller adds the card title there before
 * the data arrives, and clearing the whole container made the title disappear once the
 * load finished. Everything after the first child is replaced.
 *
 * @param layout vertical container that receives the recommendation views.
 * @param recommendations recommendations to show, in display order.
 */
private fun updateOptimizationView(layout: LinearLayout, recommendations: List<OptimizationRecommendation>) {
    // Number of recommendations shown inline before the "more" link appears.
    val previewLimit = 5

    // Drop only what a previous render added; keep the title at index 0.
    if (layout.childCount > 1) {
        layout.removeViews(1, layout.childCount - 1)
    }

    if (recommendations.isEmpty()) {
        layout.addView(
            TextView(context).apply {
                text = "暂无优化建议"
                textSize = 14f
                setTextColor(Color.GRAY)
                gravity = Gravity.CENTER
                setPadding(0, 16, 0, 16)
            }
        )
        return
    }

    recommendations.take(previewLimit).forEach { recommendation ->
        layout.addView(createRecommendationView(recommendation))
    }

    val hiddenCount = recommendations.size - previewLimit
    if (hiddenCount > 0) {
        layout.addView(
            TextView(context).apply {
                text = "还有${hiddenCount}条建议..."
                textSize = 14f
                setTextColor(Color.BLUE)
                gravity = Gravity.CENTER
                setPadding(0, 8, 0, 0)
                setOnClickListener {
                    showAllRecommendations(recommendations)
                }
            }
        )
    }
}
/**
 * Builds the view for a single optimization recommendation.
 *
 * Layout, top to bottom: a header row with the title (from [getRecommendationTitle]) and
 * the estimated savings, the recommendation's reason text, and an "apply" button that
 * forwards to [applyOptimization].
 *
 * @param recommendation the recommendation to render.
 * @return a vertical layout containing the three parts above.
 */
private fun createRecommendationView(recommendation: OptimizationRecommendation): View {
    val container = LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
        background = createBackgroundDrawable()
        // Padding is applied once, after the background: the earlier setPadding(0, 8, 0, 8)
        // was fully overwritten by this call and has been dropped.
        setPadding(8, 8, 8, 8)
    }

    val header = LinearLayout(context).apply {
        orientation = LinearLayout.HORIZONTAL
    }
    header.addView(
        TextView(context).apply {
            text = getRecommendationTitle(recommendation)
            textSize = 14f
            setTypeface(null, Typeface.BOLD)
            // Weight 1 lets the title take the free width and pushes the savings to the end.
            layoutParams = LinearLayout.LayoutParams(
                0,
                LinearLayout.LayoutParams.WRAP_CONTENT,
                1f
            )
        }
    )
    header.addView(
        TextView(context).apply {
            // Savings are truncated to a whole number for display.
            text = "节省: $${recommendation.estimatedSavings.toInt()}"
            textSize = 14f
            setTextColor(Color.GREEN)
        }
    )
    container.addView(header)

    container.addView(
        TextView(context).apply {
            text = recommendation.reason
            textSize = 12f
            setTextColor(Color.GRAY)
            setPadding(0, 4, 0, 0)
        }
    )

    container.addView(
        Button(context).apply {
            text = "应用建议"
            textSize = 12f
            setOnClickListener {
                applyOptimization(recommendation)
            }
        }
    )
    return container
}
/**
 * Maps a recommendation to the short title shown in its header row.
 *
 * Only the RESIZE title depends on the recommendation itself (it is prefixed with the
 * affected resource); every other type has a fixed title.
 *
 * @param recommendation the recommendation whose type selects the title.
 * @return the display title.
 */
private fun getRecommendationTitle(recommendation: OptimizationRecommendation): String =
    when (recommendation.type) {
        RecommendationType.RESIZE -> "${recommendation.resource}调整"
        RecommendationType.SCALE_DOWN -> "缩容"
        RecommendationType.SCALE_UP -> "扩容"
        RecommendationType.NODE_SELECTION -> "节点优化"
        RecommendationType.SCHEDULE -> "调度优化"
    }
/**
 * Handles a tap on a recommendation's "apply" button.
 *
 * Simplified implementation: no optimization is actually carried out; a short toast
 * containing the recommendation's reason is shown instead. A real implementation would
 * dispatch on the recommendation type.
 *
 * @param recommendation the recommendation the user chose to apply.
 */
private fun applyOptimization(recommendation: OptimizationRecommendation) {
    val message = "应用优化建议: ${recommendation.reason}"
    val toast = Toast.makeText(context, message, Toast.LENGTH_SHORT)
    toast.show()
}
/**
 * Opens a dialog listing every recommendation.
 *
 * The dialog body comes from [createRecommendationsListView]; the single button only
 * dismisses the dialog (no click listener is attached).
 *
 * @param recommendations the full list to display.
 */
private fun showAllRecommendations(recommendations: List<OptimizationRecommendation>) {
    val listView = createRecommendationsListView(recommendations)
    val builder = AlertDialog.Builder(context)
    builder.setTitle("所有优化建议")
    builder.setView(listView)
    builder.setPositiveButton("关闭", null)
    builder.create().show()
}
/**
 * Builds a scrollable list with one entry per recommendation.
 *
 * Each entry is produced by [createRecommendationView] and stacked vertically inside a
 * padded container, which is wrapped in a ScrollView.
 *
 * @param recommendations recommendations to list, in display order.
 * @return the ScrollView holding the list.
 */
private fun createRecommendationsListView(recommendations: List<OptimizationRecommendation>): View {
    val column = LinearLayout(context).apply {
        orientation = LinearLayout.VERTICAL
        setPadding(16, 16, 16, 16)
    }
    for (item in recommendations) {
        column.addView(createRecommendationView(item))
    }
    return ScrollView(context).apply { addView(column) }
}
}
// 数据类定义
/**
 * Aggregate figures describing all deployments at a glance: deployment and pod counts,
 * resource usage, and cost.
 *
 * NOTE(review): units are not established by this declaration. cpuUsage / memoryUsage are
 * presumably percentages or fractions, and monthlyCost / monthlySavings a currency
 * amount — confirm against the code that produces this object.
 */
data class DeploymentOverview(
val totalDeployments: Int,
val runningDeployments: Int,
val failedDeployments: Int,
val totalPods: Int,
val cpuUsage: Double,
val memoryUsage: Double,
val monthlyCost: Double,
val monthlySavings: Double
)
/**
 * Flat summary of a single deployment: identity (name, namespace, version), status, and
 * replica counts.
 *
 * NOTE(review): SecurityController later in this document reads `deployment.spec...` from
 * a type also called Deployment; this class has no `spec`, so that code presumably refers
 * to the Kubernetes client's model class instead — confirm the imports.
 * NOTE(review): uptime and createdAt are presumably milliseconds — confirm.
 */
data class Deployment(
val name: String,
val namespace: String,
val version: String,
val status: DeploymentStatus,
val replicas: Int,
val readyReplicas: Int,
val uptime: Long,
val createdAt: Long
)
/**
 * Lifecycle state of a deployment, as carried by [Deployment.status].
 */
enum class DeploymentStatus {
RUNNING,
FAILED,
PENDING,
STOPPED,
UNKNOWN
}
9. 部署平台配置管理
9.1 配置管理器
bash
/**
 * Creates, reads, updates, versions and validates configuration stored as Kubernetes
 * ConfigMaps and Secrets.
 *
 * All cluster calls are executed on [Dispatchers.IO].
 *
 * @property context Android context; not read by any method in this class.
 * @property k8sClient client used for every ConfigMap / Secret operation.
 */
class ConfigManager(
    private val context: Context,
    private val k8sClient: KubernetesClient
) {
    // ---- ConfigMap management ----

    /**
     * Creates the ConfigMap [name] in [namespace] with [data], replacing an existing one.
     *
     * @return the locally built ConfigMap that was sent (not the server's response).
     */
    suspend fun createConfigMap(namespace: String, name: String, data: Map<String, String>): ConfigMap {
        return withContext(Dispatchers.IO) {
            val configMap = ConfigMapBuilder()
                .withNewMetadata()
                .withName(name)
                .withNamespace(namespace)
                .endMetadata()
                .withData(data)
                .build()
            k8sClient.configMaps()
                .inNamespace(namespace)
                .createOrReplace(configMap)
            configMap
        }
    }

    /**
     * Fetches the ConfigMap [name] from [namespace].
     *
     * @return the ConfigMap, or null when it does not exist.
     */
    suspend fun getConfigMap(namespace: String, name: String): ConfigMap? {
        return withContext(Dispatchers.IO) {
            k8sClient.configMaps()
                .inNamespace(namespace)
                .withName(name)
                .get()
        }
    }

    /**
     * Replaces the data of an existing ConfigMap while keeping its metadata.
     *
     * @return the updated ConfigMap that was sent (not the server's response).
     * @throws IllegalArgumentException if the ConfigMap does not exist.
     */
    suspend fun updateConfigMap(namespace: String, name: String, data: Map<String, String>): ConfigMap {
        return withContext(Dispatchers.IO) {
            val existing = k8sClient.configMaps()
                .inNamespace(namespace)
                .withName(name)
                .get()
                ?: throw IllegalArgumentException("ConfigMap not found: $namespace/$name")
            // The client's ConfigMap model is not a Kotlin data class, so there is no
            // copy(); a builder seeded with the existing object does the same job.
            val updatedConfigMap = ConfigMapBuilder(existing)
                .withData(data)
                .build()
            k8sClient.configMaps()
                .inNamespace(namespace)
                .createOrReplace(updatedConfigMap)
            updatedConfigMap
        }
    }

    // ---- Secret management ----

    /**
     * Creates the Secret [name] in [namespace], replacing an existing one.
     *
     * Values in [data] are plain text; each is Base64-encoded before being stored in the
     * Secret's data map.
     *
     * @return the locally built Secret that was sent (not the server's response).
     */
    suspend fun createSecret(namespace: String, name: String, data: Map<String, String>): Secret {
        return withContext(Dispatchers.IO) {
            val secretData = data.mapValues { (_, value) ->
                Base64.getEncoder().encodeToString(value.toByteArray())
            }
            val secret = SecretBuilder()
                .withNewMetadata()
                .withName(name)
                .withNamespace(namespace)
                .endMetadata()
                .withData(secretData)
                .build()
            k8sClient.secrets()
                .inNamespace(namespace)
                .createOrReplace(secret)
            secret
        }
    }

    // ---- Config versioning ----

    /**
     * Stores [data] as an immutable-by-convention snapshot named "<configName>-<version>".
     *
     * @return a descriptor of the stored version, stamped with the current time.
     */
    suspend fun createConfigVersion(
        namespace: String,
        configName: String,
        version: String,
        data: Map<String, String>
    ): ConfigVersion {
        val versionedConfigName = "$configName-$version"
        // createConfigMap already switches to the IO dispatcher.
        createConfigMap(namespace, versionedConfigName, data)
        return ConfigVersion(
            name = configName,
            version = version,
            configMapName = versionedConfigName,
            data = data,
            createdAt = System.currentTimeMillis()
        )
    }

    /**
     * Copies the data of snapshot "<configName>-<targetVersion>" into the live ConfigMap
     * [configName].
     *
     * @return the ConfigMap returned by the replace call.
     * @throws IllegalArgumentException if the snapshot does not exist.
     * @throws IllegalStateException if the live ConfigMap does not exist.
     */
    suspend fun rollbackConfig(namespace: String, configName: String, targetVersion: String): ConfigMap {
        return withContext(Dispatchers.IO) {
            val versionedConfigName = "$configName-$targetVersion"
            val targetConfigMap = k8sClient.configMaps()
                .inNamespace(namespace)
                .withName(versionedConfigName)
                .get()
                ?: throw IllegalArgumentException("Config version not found: $versionedConfigName")

            val current = k8sClient.configMaps()
                .inNamespace(namespace)
                .withName(configName)
                .get()
                ?: throw IllegalStateException("Failed to update config map")

            // Same get -> builder -> createOrReplace path as updateConfigMap, instead of
            // the Doneable-style edit()...done() chain.
            val restored = ConfigMapBuilder(current)
                .withData(targetConfigMap.data)
                .build()
            k8sClient.configMaps()
                .inNamespace(namespace)
                .createOrReplace(restored)
        }
    }

    // ---- Config validation ----

    /**
     * Validates [data] against every field of [schema].
     *
     * @return a result that is valid only when no field produced an error; errors are
     *   listed in schema field order.
     */
    suspend fun validateConfig(data: Map<String, String>, schema: ConfigSchema): ValidationResult {
        return withContext(Dispatchers.IO) {
            val errors = mutableListOf<String>()
            schema.fields.forEach { field ->
                when (field.type) {
                    ConfigType.STRING -> validateStringField(data, field, errors)
                    ConfigType.NUMBER -> validateNumberField(data, field, errors)
                    ConfigType.BOOLEAN -> validateBooleanField(data, field, errors)
                    ConfigType.JSON -> validateJsonField(data, field, errors)
                }
            }
            ValidationResult(
                isValid = errors.isEmpty(),
                errors = errors
            )
        }
    }

    /**
     * Returns the value of [field] from [data]; when it is absent, records a
     * "required" error if the field is mandatory and returns null.
     */
    private fun presentValue(data: Map<String, String>, field: ConfigField, errors: MutableList<String>): String? {
        val value = data[field.name]
        if (value == null && field.required) {
            errors.add("${field.name} 是必填字段")
        }
        return value
    }

    /** Checks presence, min/max length and the optional regex pattern of a STRING field. */
    private fun validateStringField(data: Map<String, String>, field: ConfigField, errors: MutableList<String>) {
        val value = presentValue(data, field, errors) ?: return
        field.minLength?.let { min ->
            if (value.length < min) errors.add("${field.name} 长度不能小于 $min")
        }
        field.maxLength?.let { max ->
            if (value.length > max) errors.add("${field.name} 长度不能大于 $max")
        }
        field.pattern?.let { pattern ->
            // The whole value must match the pattern.
            if (!value.matches(Regex(pattern))) errors.add("${field.name} 格式不正确")
        }
    }

    /** Checks presence, numeric format and the optional min/max bounds of a NUMBER field. */
    private fun validateNumberField(data: Map<String, String>, field: ConfigField, errors: MutableList<String>) {
        val value = presentValue(data, field, errors) ?: return
        val number = value.toDoubleOrNull()
        // "NaN" parses successfully but compares false against every bound, so it would
        // slip through the range checks; reject it as not-a-number.
        if (number == null || number.isNaN()) {
            errors.add("${field.name} 必须是数字")
            return
        }
        field.minValue?.let { min ->
            if (number < min) errors.add("${field.name} 不能小于 $min")
        }
        field.maxValue?.let { max ->
            if (number > max) errors.add("${field.name} 不能大于 $max")
        }
    }

    /** Checks presence and that a BOOLEAN field is exactly "true" or "false". */
    private fun validateBooleanField(data: Map<String, String>, field: ConfigField, errors: MutableList<String>) {
        val value = presentValue(data, field, errors) ?: return
        if (value != "true" && value != "false") {
            errors.add("${field.name} 必须是 true 或 false")
        }
    }

    /** Checks presence and that a JSON field parses as a JSON element. */
    private fun validateJsonField(data: Map<String, String>, field: ConfigField, errors: MutableList<String>) {
        val value = presentValue(data, field, errors) ?: return
        try {
            Json.parseToJsonElement(value)
        } catch (e: Exception) {
            errors.add("${field.name} 必须是有效的 JSON")
        }
    }
}
// 配置相关数据类
/**
 * A named set of field definitions that a configuration map is validated against.
 *
 * ConfigManager.validateConfig iterates [fields] in order; [name] and [description] are
 * not read by the validation code.
 */
data class ConfigSchema(
val name: String,
val description: String,
val fields: List<ConfigField>
)
/**
 * Definition of one configuration key and the constraints its value must satisfy.
 *
 * [name] is the key looked up in the configuration map and [type] selects which validator
 * runs. All constraint properties are optional; a null constraint is not checked.
 */
data class ConfigField(
val name: String,
val type: ConfigType,
val description: String,
// Consulted by ConfigManager's validation when the key is absent from the data.
val required: Boolean = false,
// Not read by any code visible in this section.
val defaultValue: String? = null,
// Length bounds, checked for STRING fields.
val minLength: Int? = null,
val maxLength: Int? = null,
// Value bounds, checked for NUMBER fields.
val minValue: Double? = null,
val maxValue: Double? = null,
// Regex the whole value must match, checked for STRING fields.
val pattern: String? = null,
// Not read by any code visible in this section.
val options: List<String>? = null
)
/**
 * Value type of a [ConfigField]; selects which validator ConfigManager.validateConfig
 * applies to the field.
 */
enum class ConfigType {
STRING, NUMBER, BOOLEAN, JSON
}
/**
 * Descriptor of a stored configuration snapshot, as returned by
 * ConfigManager.createConfigVersion.
 *
 * [configMapName] is the snapshot's ConfigMap name, built there as "<name>-<version>";
 * [createdAt] is taken from System.currentTimeMillis() at creation.
 */
data class ConfigVersion(
val name: String,
val version: String,
val configMapName: String,
val data: Map<String, String>,
val createdAt: Long
)
/**
 * Outcome of validating a configuration map against a schema.
 *
 * ConfigManager.validateConfig sets [isValid] to true exactly when [errors] is empty.
 */
data class ValidationResult(
val isValid: Boolean,
val errors: List<String>
)
10. 部署平台安全
10.1 安全控制器
bash
/**
 * Security operations for deployed workloads: applying network / pod security policies,
 * scanning a deployment for risky settings, and storing or reading secrets through Vault.
 *
 * Cluster and Vault calls are executed on [Dispatchers.IO].
 *
 * NOTE(review): `log`, `generateFixAction` and `generateComplianceAction` are used below
 * but not declared in this class — presumably defined elsewhere; confirm.
 *
 * @property k8sClient client used for all Kubernetes calls.
 * @property vaultClient client used by [storeSecret] and [getSecret].
 */
class SecurityController(
    private val k8sClient: KubernetesClient,
    private val vaultClient: VaultClient
) {
    /** A named set of security rules for one namespace. */
    data class SecurityPolicy(
        val name: String,
        val namespace: String,
        val rules: List<SecurityRule>,
        val enforcementMode: EnforcementMode
    )

    /** One rule of a [SecurityPolicy]: what it selects, under which conditions, and the action taken. */
    data class SecurityRule(
        val name: String,
        val type: SecurityRuleType,
        val selector: Map<String, String>,
        val conditions: List<SecurityCondition>,
        val action: SecurityAction
    )

    // ---- Network policy ----

    /** Creates [policy] in [namespace], replacing an existing one of the same name. */
    suspend fun createNetworkPolicy(namespace: String, policy: NetworkPolicy): NetworkPolicy {
        return withContext(Dispatchers.IO) {
            k8sClient.network().v1()
                .networkPolicies()
                .inNamespace(namespace)
                .createOrReplace(policy)
        }
    }

    // ---- Pod security policy ----

    /**
     * Creates or replaces [policy].
     *
     * [namespace] is accepted but not used: the call is made without a namespace.
     * NOTE(review): PodSecurityPolicy (policy/v1beta1) was removed in Kubernetes 1.25;
     * this will fail against newer clusters — confirm the target cluster version.
     */
    suspend fun createPodSecurityPolicy(namespace: String, policy: PodSecurityPolicy): PodSecurityPolicy {
        return withContext(Dispatchers.IO) {
            k8sClient.policy().v1beta1()
                .podSecurityPolicies()
                .createOrReplace(policy)
        }
    }

    // ---- Security scan ----

    /**
     * Scans one deployment and summarises the findings.
     *
     * @return vulnerabilities, compliance issues, a 0–100 score and recommendations.
     * @throws IllegalArgumentException if the deployment does not exist.
     */
    suspend fun scanDeployment(namespace: String, deploymentName: String): SecurityScanResult {
        return withContext(Dispatchers.IO) {
            val deployment = k8sClient.apps().deployments()
                .inNamespace(namespace)
                .withName(deploymentName)
                .get()
                ?: throw IllegalArgumentException("Deployment not found")
            val vulnerabilities = scanForVulnerabilities(deployment)
            val complianceIssues = checkCompliance(deployment)
            val securityScore = calculateSecurityScore(vulnerabilities, complianceIssues)
            SecurityScanResult(
                deployment = deploymentName,
                namespace = namespace,
                vulnerabilities = vulnerabilities,
                complianceIssues = complianceIssues,
                securityScore = securityScore,
                recommendations = generateSecurityRecommendations(vulnerabilities, complianceIssues)
            )
        }
    }

    /**
     * Tells whether [image] resolves to the mutable "latest" tag.
     *
     * Looks at the tag only (the part after ':' in the last path segment), so a name that
     * merely contains the word "latest" is not flagged, and a registry port such as
     * "registry:5000/app" is not mistaken for a tag. An image with no tag counts as
     * "latest"; an image pinned by digest ("name@sha256:...") does not.
     */
    private fun usesLatestTag(image: String): Boolean {
        if ('@' in image) return false
        val lastSegment = image.substringAfterLast('/')
        val tag = lastSegment.substringAfter(':', missingDelimiterValue = "latest")
        return tag == "latest"
    }

    /**
     * Flags containers that use the "latest" image tag or lack a security context, and a
     * pod template that lacks a security context.
     */
    private fun scanForVulnerabilities(deployment: Deployment): List<Vulnerability> {
        val vulnerabilities = mutableListOf<Vulnerability>()
        val podSpec = deployment.spec.template.spec

        // Per-container checks.
        podSpec.containers.forEach { container ->
            val image: String? = container.image
            if (image != null && usesLatestTag(image)) {
                vulnerabilities.add(
                    Vulnerability(
                        type = VulnerabilityType.IMAGE_TAG,
                        severity = VulnerabilitySeverity.MEDIUM,
                        description = "使用latest标签: $image",
                        affectedResource = "container:${container.name}",
                        recommendation = "使用特定版本标签"
                    )
                )
            }
            if (container.securityContext == null) {
                vulnerabilities.add(
                    Vulnerability(
                        type = VulnerabilityType.SECURITY_CONTEXT,
                        severity = VulnerabilitySeverity.HIGH,
                        description = "容器未设置安全上下文",
                        affectedResource = "container:${container.name}",
                        recommendation = "设置安全上下文,限制权限"
                    )
                )
            }
        }

        // Pod-level check.
        if (podSpec.securityContext == null) {
            vulnerabilities.add(
                Vulnerability(
                    type = VulnerabilityType.POD_SECURITY,
                    severity = VulnerabilitySeverity.MEDIUM,
                    description = "Pod未设置安全上下文",
                    affectedResource = "pod",
                    recommendation = "设置Pod安全上下文"
                )
            )
        }
        return vulnerabilities
    }

    /**
     * Flags containers without resource limits or with an image pull policy other than
     * "IfNotPresent".
     *
     * NOTE(review): an unset pull policy and "Always" are both reported — confirm this is
     * the intended rule.
     */
    private fun checkCompliance(deployment: Deployment): List<ComplianceIssue> {
        val issues = mutableListOf<ComplianceIssue>()
        deployment.spec.template.spec.containers.forEach { container ->
            // An empty limits map restricts nothing, so it counts as "not set" too.
            if (container.resources?.limits.isNullOrEmpty()) {
                issues.add(
                    ComplianceIssue(
                        type = ComplianceType.RESOURCE_LIMITS,
                        standard = "Kubernetes Best Practices",
                        description = "容器未设置资源限制",
                        affectedResource = "container:${container.name}",
                        requirement = "必须设置CPU和内存限制"
                    )
                )
            }
            if (container.imagePullPolicy != "IfNotPresent") {
                issues.add(
                    ComplianceIssue(
                        type = ComplianceType.IMAGE_PULL,
                        standard = "Security Best Practices",
                        description = "镜像拉取策略不安全",
                        affectedResource = "container:${container.name}",
                        requirement = "建议使用IfNotPresent策略"
                    )
                )
            }
        }
        return issues
    }

    /**
     * Scores a scan from 100 downwards: each vulnerability costs its severity weight and
     * each compliance issue costs 0.5. The result is clamped to 0..100.
     */
    private fun calculateSecurityScore(vulnerabilities: List<Vulnerability>, issues: List<ComplianceIssue>): Double {
        val maxScore = 100.0
        val penalty = vulnerabilities.sumOf { it.severity.weight } + issues.size * 0.5
        return (maxScore - penalty).coerceIn(0.0, maxScore)
    }

    /**
     * Maps a vulnerability severity to the recommendation priority of the same name.
     * VulnerabilitySeverity itself declares only a weight, so the mapping lives here.
     */
    private fun priorityOf(severity: VulnerabilitySeverity): Priority = when (severity) {
        VulnerabilitySeverity.LOW -> Priority.LOW
        VulnerabilitySeverity.MEDIUM -> Priority.MEDIUM
        VulnerabilitySeverity.HIGH -> Priority.HIGH
        VulnerabilitySeverity.CRITICAL -> Priority.CRITICAL
    }

    /**
     * Turns findings into recommendations, highest priority first.
     *
     * Vulnerabilities take the priority matching their severity; compliance issues are
     * always MEDIUM. The sort is stable, so equal priorities keep the order
     * "vulnerabilities, then issues".
     */
    private fun generateSecurityRecommendations(
        vulnerabilities: List<Vulnerability>,
        issues: List<ComplianceIssue>
    ): List<SecurityRecommendation> {
        val fromVulnerabilities = vulnerabilities.map { vuln ->
            SecurityRecommendation(
                type = RecommendationType.VULNERABILITY_FIX,
                priority = priorityOf(vuln.severity),
                description = vuln.recommendation,
                action = generateFixAction(vuln)
            )
        }
        val fromIssues = issues.map { issue ->
            SecurityRecommendation(
                type = RecommendationType.COMPLIANCE,
                priority = Priority.MEDIUM,
                description = issue.requirement,
                action = generateComplianceAction(issue)
            )
        }
        return (fromVulnerabilities + fromIssues).sortedByDescending { it.priority.ordinal }
    }

    // ---- Secret management ----

    /**
     * Stores [value] under [key] at the Vault path "kubernetes/<namespace>".
     *
     * @return true on success, false when the Vault call failed (the error is logged).
     */
    suspend fun storeSecret(key: String, value: String, namespace: String): Boolean {
        return withContext(Dispatchers.IO) {
            try {
                vaultClient.storeSecret("kubernetes/$namespace", key, value)
                true
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation is not a failure; let it propagate to the caller's scope.
                throw e
            } catch (e: Exception) {
                log.error("存储密钥失败", e)
                false
            }
        }
    }

    /**
     * Reads [key] from the Vault path "kubernetes/<namespace>".
     *
     * @return the secret value, or null when the Vault call failed (the error is logged).
     */
    suspend fun getSecret(key: String, namespace: String): String? {
        return withContext(Dispatchers.IO) {
            try {
                vaultClient.getSecret("kubernetes/$namespace", key)
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation is not a failure; let it propagate to the caller's scope.
                throw e
            } catch (e: Exception) {
                log.error("获取密钥失败", e)
                null
            }
        }
    }
}
// 安全相关数据类
/**
 * Result of SecurityController.scanDeployment for one deployment.
 *
 * [deployment] is the deployment's name (not the object), and [securityScore] is the
 * value computed by calculateSecurityScore, which clamps it to 0..100.
 */
data class SecurityScanResult(
val deployment: String,
val namespace: String,
val vulnerabilities: List<Vulnerability>,
val complianceIssues: List<ComplianceIssue>,
val securityScore: Double,
val recommendations: List<SecurityRecommendation>
)
/**
 * A single security finding produced by the deployment scan.
 *
 * The scanner fills [affectedResource] with "container:<name>" for container-level
 * findings and "pod" for pod-level ones; [recommendation] is the suggested remedy text.
 */
data class Vulnerability(
val type: VulnerabilityType,
val severity: VulnerabilitySeverity,
val description: String,
val affectedResource: String,
val recommendation: String
)
/**
 * Category of a [Vulnerability].
 *
 * The scanner visible in this section emits IMAGE_TAG, SECURITY_CONTEXT and POD_SECURITY;
 * NETWORK_POLICY and PRIVILEGE_ESCALATION are declared but not produced there.
 */
enum class VulnerabilityType {
IMAGE_TAG,
SECURITY_CONTEXT,
POD_SECURITY,
NETWORK_POLICY,
PRIVILEGE_ESCALATION
}
/**
 * Severity of a [Vulnerability].
 *
 * [weight] is the number of points the finding costs: the security score starts at 100
 * and the weights of all vulnerabilities are subtracted from it.
 */
enum class VulnerabilitySeverity(val weight: Double) {
LOW(1.0),
MEDIUM(2.0),
HIGH(3.0),
CRITICAL(5.0)
}
/**
 * A deviation from a named standard found by the compliance check.
 *
 * [standard] names the rule set the check belongs to, [requirement] states what the rule
 * demands, and [affectedResource] is filled with "container:<name>" by the checker.
 */
data class ComplianceIssue(
val type: ComplianceType,
val standard: String,
val description: String,
val affectedResource: String,
val requirement: String
)
/**
 * Category of a [ComplianceIssue].
 *
 * The checker visible in this section emits RESOURCE_LIMITS and IMAGE_PULL; the remaining
 * constants are declared but not produced there.
 */
enum class ComplianceType {
RESOURCE_LIMITS,
IMAGE_PULL,
NETWORK_ACCESS,
VOLUME_MOUNT,
SERVICE_ACCOUNT
}
/**
 * A suggested remediation derived from a vulnerability or a compliance issue.
 *
 * [description] is copied from the finding's recommendation / requirement text;
 * [action] is a generated description of the step to take.
 */
data class SecurityRecommendation(
val type: RecommendationType,
val priority: Priority,
val description: String,
val action: String
)
/**
 * Origin of a [SecurityRecommendation]: a vulnerability, a compliance issue, or a general
 * best practice.
 *
 * NOTE(review): the dashboard code earlier in this document switches on RESIZE,
 * SCALE_DOWN, SCALE_UP, NODE_SELECTION and SCHEDULE of a type with the same name; this
 * enum declares none of them, so that is presumably a different RecommendationType in
 * another package — confirm to avoid a name clash.
 */
enum class RecommendationType {
VULNERABILITY_FIX,
COMPLIANCE,
BEST_PRACTICE
}
/**
 * Urgency of a recommendation.
 *
 * Declaration order is significant: recommendations are sorted by ordinal, descending,
 * so constants must stay ordered from least to most urgent.
 */
enum class Priority {
LOW, MEDIUM, HIGH, CRITICAL
}
总结
本文详细介绍了模型服务部署与运维平台的完整设计与实现,包括:
核心功能模块
持续集成/持续部署:自动化构建、测试、部署流程
容器编排:基于Kubernetes的部署管理
监控告警:全面的指标收集、监控和告警系统
自动扩缩容:基于指标的智能扩缩容策略
故障恢复:自愈机制和自动恢复能力
成本优化:资源使用分析和成本优化建议
**安全管理