This share focuses on OpenKruise's kruise-daemon component, covering the component's source code implementation and the CRDs related to its functionality.
Kruise-Daemon
Introduction
As the diagram above shows, kruise-daemon is a component deployed on every worker node. Its responsibilities resemble the kubelet's: both manage container lifecycles. In terms of OpenKruise's positioning, kruise-daemon serves to augment the kubelet.
This share analyzes both kruise-daemon itself and the ContainerRecreateRequest CRD that makes use of it.
Architecture
- queue: each controller's queue is a delaying queue that also limits how many times an item can be requeued; its implementation mimics Kubernetes' Reconcile flow.
  - Limiting the requeue count per item: the queue internally records each item's failure count. When an item fails and is requeued, its delay doubles; once the failure count grows large, the delay becomes so long that the item never reaches the head of the queue and is effectively never taken out (see the sketch after this list).
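As a rough illustration (not the kruise-daemon source), the sketch below shows how such a queue is typically assembled from client-go's workqueue package: a rate-limiting delaying queue whose per-item delay doubles on every failure. The queue name, item key, and delay values are made up for the example.
```go
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	// Per-item exponential backoff: base delay 500ms, doubled on each failure, capped at 30s.
	limiter := workqueue.NewItemExponentialFailureRateLimiter(500*time.Millisecond, 30*time.Second)
	queue := workqueue.NewNamedRateLimitingQueue(limiter, "demo")
	defer queue.ShutDown()

	queue.Add("default/pod-a")

	for i := 0; i < 3; i++ {
		key, quit := queue.Get()
		if quit {
			return
		}
		// Pretend the sync failed: requeue with an exponentially growing delay.
		queue.AddRateLimited(key)
		fmt.Printf("%v has failed %d time(s); its requeue delay doubles each time\n", key, limiter.NumRequeues(key))
		// On success we would call queue.Forget(key) instead, to reset the failure count.
		queue.Done(key)
	}
}
```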
Startup Flow Source Code Analysis
Overall Flow
```go
func main() {
// ...
cfg := config.GetConfigOrDie()
// Identify this client as kruise-daemon
cfg.UserAgent = "kruise-daemon"
// ...
go func() {
// Listen on port 10222 and start the HTTP (pprof) service
if err := http.ListenAndServe(*pprofAddr, nil); err != nil {
klog.Fatal(err, "unable to start pprof")
}
}()
// Create the signal handler
ctx := signals.SetupSignalHandler()
// Create the daemon
d, err := daemon.NewDaemon(cfg, *bindAddr)
if err != nil {
klog.Fatalf("Failed to new daemon: %v", err)
}
// Run the daemon
if err := d.Run(ctx); err != nil {
klog.Fatalf("Failed to start daemon: %v", err)
}
}
// Run the daemon
func (d *daemon) Run(ctx context.Context) error {
// If a podInformer is configured, run it
if d.podInformer != nil {
go d.podInformer.Run(ctx.Done())
if !cache.WaitForCacheSync(ctx.Done(), d.podInformer.HasSynced) {
return fmt.Errorf("error waiting for pod informer synced")
}
}
// Run the health-check service and expose metrics
go d.serve(ctx)
// Run each controller in runnables
for _, r := range d.runnables {
go r.Run(ctx.Done())
}
// Wait for termination or an error signal
select {
case <-ctx.Done():
// We are done
return nil
case <-d.errSignal.GotError():
// Error starting a controller
return d.errSignal.Error()
}
}
```
Daemon Creation Flow
```go
// NewDaemon create a daemon
func NewDaemon(cfg *rest.Config, bindAddress string) (Daemon, error) {
// Make sure the config is not nil
if cfg == nil {
return nil, fmt.Errorf("cfg can not be nil")
}
// Get the nodeName
nodeName, err := daemonutil.NodeName()
if err != nil {
return nil, err
}
klog.Infof("Starting daemon on %v ...", nodeName)
// Listen on port 10221 and start a TCP service used for health checks and metrics collection
listener, err := net.Listen("tcp", bindAddress)
if err != nil {
return nil, fmt.Errorf("new listener error: %v", err)
}
// Initialize the health-check service
healthz := daemonutil.NewHealthz()
// Initialize the runtimeClient
runtimeClient, err := runtimeclient.New(cfg, runtimeclient.Options{Scheme: scheme})
if err != nil {
return nil, fmt.Errorf("failed to new controller-runtime client: %v", err)
}
// ...
// Initialize the podInformer
var podInformer cache.SharedIndexInformer
if utilfeature.DefaultFeatureGate.Enabled(features.DaemonWatchingPod) {
podInformer = newPodInformer(genericClient.KubeClient, nodeName)
}
// Initialize the accountManager
accountManager := daemonutil.NewImagePullAccountManager(genericClient.KubeClient)
// Initialize the runtimeFactory
runtimeFactory, err := daemonruntime.NewFactory(varRunMountPath, accountManager)
if err != nil {
return nil, fmt.Errorf("failed to new runtime factory: %v", err)
}
// Initialize the secretManager
secretManager := daemonutil.NewCacheBasedSecretManager(genericClient.KubeClient)
// Bundle the initialized resources into the options passed to the controllers
opts := daemonoptions.Options{
NodeName: nodeName,
Scheme: scheme,
RuntimeClient: runtimeClient,
PodInformer: podInformer,
RuntimeFactory: runtimeFactory,
Healthz: healthz,
}
// Create the ImagePuller controller
puller, err := imagepuller.NewController(opts, secretManager)
if err != nil {
return nil, fmt.Errorf("failed to new image puller controller: %v", err)
}
// Create the CRR controller
crrController, err := containerrecreate.NewController(opts)
if err != nil {
return nil, fmt.Errorf("failed to new crr daemon controller: %v", err)
}
// Collect the runnables
var runnables = []Runnable{
puller,
crrController,
}
// ...
// Construct the daemon and return it
return &daemon{
runtimeFactory: runtimeFactory,
podInformer: podInformer,
runnables: runnables,
listener: listener,
healthz: healthz,
errSignal: &errSignaler{errSignal: make(chan struct{})},
}, nil
}
```
ContainerRecreateRequest
Introduction
ContainerRecreateRequest (CRR for short) is used to restart/recreate one or more containers in an existing Pod. While a container is being recreated, the other containers in the Pod keep running normally.
Benefits of using ContainerRecreateRequest:
- With the traditional approach, recreating a container means recreating the whole Pod, and every existing container in that Pod gets rebuilt. With CRR, only the targeted container is recreated while the existing ones are unaffected;
- Recreating a Pod goes through resource allocation, metadata setup, scheduling, and so on. Recreating a single container does not require rescheduling the Pod;
Let's look at the official YAML template:
```yaml
apiVersion: apps.kruise.io/v1alpha1
kind: ContainerRecreateRequest
metadata:
  namespace: pod-namespace
  name: xxx
spec:
  podName: pod-name
  containers:
  - name: app
  - name: sidecar
  strategy:
    # failurePolicy decides what happens when recreating a container fails
    # (a container only counts as successfully recreated once it keeps running for minStartedSeconds, see below):
    # Fail ends the whole CRR as soon as any container fails;
    # Ignore ignores the failure and keeps recreating the remaining containers.
    failurePolicy: Fail
    # Whether to recreate the containers one by one, in order
    orderedRecreate: false
    # When enabled, the Pod owning the containers to be recreated is first set to notReady,
    # then the controller waits unreadyGracePeriodSeconds before actually recreating the containers.
    # This prevents in-flight requests from failing because a container restarts halfway through,
    # making the shutdown smoother.
    unreadyGracePeriodSeconds: 3
    # A container is considered successfully recreated only if it keeps running for minStartedSeconds;
    # if the container defines a liveness/readiness probe, the probe must report success first,
    # and only then is the running time checked against minStartedSeconds;
    # if the liveness probe fails, the kubelet will try to restart the container again.
    minStartedSeconds: 10
  # If the CRR still has no result (success or failure) after activeDeadlineSeconds,
  # it is marked as failed.
  activeDeadlineSeconds: 300
  # After the CRR finishes (success or failure), it is kept for ttlSecondsAfterFinished
  # and then deleted.
  ttlSecondsAfterFinished: 1800
status:
  # Records the recreation result of each container
  containerRecreateStates:
  - name: app
    phase: Succeeded
  # Records the overall recreation phase of the CRR
  phase: Completed
  # ...
```
How It Works
The overall implementation can be summarized as follows (a minimal usage sketch follows the list):
- The Reconcile logic of the CRR CRD is entered;
- In Reconcile, the Pod and container information from the Spec is written into the Status;
- kruise-daemon periodically pulls the CRR and reads the Pod and container information from the Status;
- Based on that information, it actively kills the containers;
- The kubelet manages the container lifecycle: when it notices that a container has exited, it tries to start it again;
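As a minimal usage sketch of how a user triggers this flow (not part of the OpenKruise source; the namespace, object name, the github.com/openkruise/kruise-api module path, and the AddToScheme/FailurePolicy identifiers are assumptions for illustration), the snippet below creates a CRR asking kruise-daemon to recreate the app container of an existing Pod:
```go
package main

import (
	"context"

	appsv1alpha1 "github.com/openkruise/kruise-api/apps/v1alpha1" // assumed module path
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/scheme"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func main() {
	// Register the Kruise types so the client can serialize them.
	_ = appsv1alpha1.AddToScheme(scheme.Scheme)

	c, err := client.New(ctrl.GetConfigOrDie(), client.Options{Scheme: scheme.Scheme})
	if err != nil {
		panic(err)
	}

	crr := &appsv1alpha1.ContainerRecreateRequest{
		ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "recreate-app"},
		Spec: appsv1alpha1.ContainerRecreateRequestSpec{
			PodName:    "pod-name",
			Containers: []appsv1alpha1.ContainerRecreateRequestContainer{{Name: "app"}},
			Strategy: &appsv1alpha1.ContainerRecreateRequestStrategy{
				// Fail: the whole CRR ends as soon as any container fails to recreate.
				FailurePolicy:     appsv1alpha1.ContainerRecreateRequestFailurePolicyFail,
				MinStartedSeconds: 10,
			},
		},
	}
	if err := c.Create(context.TODO(), crr); err != nil {
		panic(err)
	}
	// The CRR controller and kruise-daemon then drive the flow described above;
	// progress can be watched in status.containerRecreateStates.
}
```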
Source Code Analysis
Reconcile
```go
func (r *ReconcileContainerRecreateRequest) Reconcile(_ context.Context, request reconcile.Request) (res reconcile.Result, err error) {
// ...
// Fetch the CRR
crr := &appsv1alpha1.ContainerRecreateRequest{}
err = r.Get(context.TODO(), request.NamespacedName, crr)
if err != nil {
if errors.IsNotFound(err) {
return reconcile.Result{}, nil
}
return reconcile.Result{}, err
}
// Fetch the Pod
pod := &v1.Pod{}
podErr := r.Get(context.TODO(), types.NamespacedName{Namespace: crr.Namespace, Name: crr.Spec.PodName}, pod)
if podErr != nil && !errors.IsNotFound(podErr) {
return reconcile.Result{}, fmt.Errorf("failed to get Pod for CRR: %v", podErr)
}
// Determine whether the CRR is still active:
// a DeletionTimestamp means the CRR is about to be deleted,
// a CompletionTime means the CRR has already finished
if crr.DeletionTimestamp != nil || crr.Status.CompletionTime != nil {
// If graceful shutdown was configured, a readiness probe was injected into the Pod; since the CRR is finishing here, the readiness probe needs to be removed
if slice.ContainsString(crr.Finalizers, appsv1alpha1.ContainerRecreateRequestUnreadyAcquiredKey, nil) {
if err := r.releasePodNotReady(crr, pod); err != nil {
return reconcile.Result{}, err
}
}
// The CRR is about to be deleted
if crr.DeletionTimestamp != nil {
return reconcile.Result{}, nil
}
// This label marks the CRR as active; once the CRR is no longer active, the label must be removed
if _, ok := crr.Labels[appsv1alpha1.ContainerRecreateRequestActiveKey]; ok {
body := fmt.Sprintf(`{"metadata":{"labels":{"%s":null}}}`, appsv1alpha1.ContainerRecreateRequestActiveKey)
return reconcile.Result{}, r.Patch(context.TODO(), crr, client.RawPatch(types.MergePatchType, []byte(body)))
}
var leftTime time.Duration
// The CRR has a TTL configured
if crr.Spec.TTLSecondsAfterFinished != nil {
leftTime = time.Duration(*crr.Spec.TTLSecondsAfterFinished)*time.Second - time.Since(crr.Status.CompletionTime.Time)
// The TTL has expired
if leftTime <= 0 {
klog.Infof("Deleting CRR %s/%s for ttlSecondsAfterFinished", crr.Namespace, crr.Name)
// Delete the CRR directly
if err = r.Delete(context.TODO(), crr); err != nil {
return reconcile.Result{}, fmt.Errorf("delete CRR error: %v", err)
}
return reconcile.Result{}, nil
}
}
return reconcile.Result{RequeueAfter: leftTime}, nil
}
// If the Pod is gone or no longer matches, complete the CRR and end the reconcile
if errors.IsNotFound(podErr) || pod.DeletionTimestamp != nil || string(pod.UID) != crr.Labels[appsv1alpha1.ContainerRecreateRequestPodUIDKey] {
klog.Warningf("Complete CRR %s/%s as failure for Pod %s with UID=%s has gone",
crr.Namespace, crr.Name, crr.Spec.PodName, crr.Labels[appsv1alpha1.ContainerRecreateRequestPodUIDKey])
return reconcile.Result{}, r.completeCRR(crr, "pod has gone")
}
duration := requeueduration.Duration{}
// The status phase is reported by kruise-daemon; if nothing has been reported for more than 1 minute, complete the CRR as failed
if crr.Status.Phase == "" {
leftTime := responseTimeout - time.Since(crr.CreationTimestamp.Time)
if leftTime <= 0 {
klog.Warningf("Complete CRR %s/%s as failure for daemon has not responded for a long time", crr.Namespace, crr.Name)
return reconcile.Result{}, r.completeCRR(crr, "daemon has not responded for a long time")
}
duration.Update(leftTime)
}
// If the CRR has exceeded its activeDeadlineSeconds, complete it as failed
if crr.Spec.ActiveDeadlineSeconds != nil {
leftTime := time.Duration(*crr.Spec.ActiveDeadlineSeconds)*time.Second - time.Since(crr.CreationTimestamp.Time)
if leftTime <= 0 {
klog.Warningf("Complete CRR %s/%s as failure for recreating has exceeded the activeDeadlineSeconds", crr.Namespace, crr.Name)
return reconcile.Result{}, r.completeCRR(crr, "recreating has exceeded the activeDeadlineSeconds")
}
duration.Update(leftTime)
}
// If the CRR is not in the Recreating phase, end this reconcile
if crr.Status.Phase != appsv1alpha1.ContainerRecreateRequestRecreating {
return reconcile.Result{RequeueAfter: duration.Get()}, nil
}
// Record the Pod and container information on the CRR (stored in an annotation)
if err := r.syncContainerStatuses(crr, pod); err != nil {
return reconcile.Result{}, fmt.Errorf("sync containerStatuses error: %v", err)
}
// Graceful shutdown configured: inject the readiness probe into the Pod (mark it notReady)
if crr.Spec.Strategy.UnreadyGracePeriodSeconds != nil && crr.Annotations[appsv1alpha1.ContainerRecreateRequestUnreadyAcquiredKey] == "" {
if err = r.acquirePodNotReady(crr, pod); err != nil {
return reconcile.Result{}, err
}
}
return reconcile.Result{RequeueAfter: duration.Get()}, nil
}
// The sync process
func (r *ReconcileContainerRecreateRequest) syncContainerStatuses(crr *appsv1alpha1.ContainerRecreateRequest, pod *v1.Pod) error {
syncContainerStatuses := make([]appsv1alpha1.ContainerRecreateRequestSyncContainerStatus, 0, len(crr.Spec.Containers))
// Iterate over the containers in the spec
for i := range crr.Spec.Containers {
c := &crr.Spec.Containers[i]
// Get the container's current status
containerStatus := util.GetContainerStatus(c.Name, pod)
// Skip if the container status cannot be found
if containerStatus == nil {
klog.Warningf("Not found %s container in Pod Status for CRR %s/%s", c.Name, crr.Namespace, crr.Name)
continue
// Skip if the container is not running
} else if containerStatus.State.Running == nil || containerStatus.State.Running.StartedAt.Before(&crr.CreationTimestamp) {
continue
}
// Build the status entry
syncContainerStatuses = append(syncContainerStatuses, appsv1alpha1.ContainerRecreateRequestSyncContainerStatus{
Name: containerStatus.Name,
Ready: containerStatus.Ready,
RestartCount: containerStatus.RestartCount,
ContainerID: containerStatus.ContainerID,
})
}
// Serialize the status info to a JSON string
syncContainerStatusesStr := util.DumpJSON(syncContainerStatuses)
// Write it into an annotation, recording each container's state before the recreation
if crr.Annotations[appsv1alpha1.ContainerRecreateRequestSyncContainerStatusesKey] != syncContainerStatusesStr {
body := util.DumpJSON(syncPatchBody{Metadata: syncPatchMetadata{Annotations: map[string]string{appsv1alpha1.ContainerRecreateRequestSyncContainerStatusesKey: syncContainerStatusesStr}}})
return r.Patch(context.TODO(), crr, client.RawPatch(types.MergePatchType, []byte(body)))
}
return nil
}
```
kruise-daemon
Overall Flow
```go
func (c *Controller) Run(stop <-chan struct{}) {
defer utilruntime.HandleCrash()
defer c.queue.ShutDown()
klog.Info("Starting informer for ContainerRecreateRequest")
// Run the CRR informer
go c.crrInformer.Run(stop)
if !cache.WaitForCacheSync(stop, c.crrInformer.HasSynced) {
return
}
klog.Infof("Starting crr daemon controller")
// Start 32 workers to process items from the queue
for i := 0; i < workers; i++ {
go wait.Until(func() {
for c.processNextWorkItem() {
}
}, time.Second, stop)
}
klog.Info("Started crr daemon controller successfully")
<-stop
}
// Processing one item from the queue
func (c *Controller) processNextWorkItem() bool {
// Take one item from the queue.
// You may wonder how items can be taken from the queue when there seems to be no enqueue operation.
// This relates to the informer mentioned earlier: informers support callbacks, and when the CRR
// controller is created, its addFunc, updateFunc and deleteFunc callbacks all enqueue items,
// which is why items can be retrieved here.
key, quit := c.queue.Get()
if quit {
return false
}
defer c.queue.Done(key)
// Run sync
err := c.sync(key.(string))
// If the sync did not succeed, requeue the item
if err == nil {
// No error: clear this item's retry count and finish
c.queue.Forget(key)
} else {
// On error, requeue the item and record the retry count;
// once the retry count grows large enough, the item's requeue is delayed so long that it is effectively ignored
c.queue.AddRateLimited(key)
}
return true
}
```
In fact, kruise-daemon's implementation is modeled on Kubernetes controllers; take the ReplicaSet controller as an example.
A controller is essentially a set of workers running the control loop: each worker is a goroutine, and the workers take items from the queue in parallel.
```go
func (rsc *ReplicaSetController) worker() {
for rsc.processNextWorkItem() {
}
}
func (rsc *ReplicaSetController) processNextWorkItem() bool {
key, quit := rsc.queue.Get()
if quit {
return false
}
defer rsc.queue.Done(key)
err := rsc.syncHandler(key.(string))
if err == nil {
rsc.queue.Forget(key)
return true
}
utilruntime.HandleError(fmt.Errorf("sync %q failed with %v", key, err))
rsc.queue.AddRateLimited(key)
return true
}
```
sync Flow
```go
func (c *Controller) sync(key string) (retErr error) {
// Extract the namespace and podName
namespace, podName, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
klog.Warningf("Invalid key: %s", key)
return nil
}
// Build a label selector from the podName
sel, _ := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{MatchLabels: map[string]string{appsv1alpha1.ContainerRecreateRequestPodNameKey: podName}})
// List the CRRs matching the label selector
crrList, err := c.crrLister.ContainerRecreateRequests(namespace).List(sel)
if err != nil {
klog.Errorf("Failed to list ContainerRecreateRequest for Pod %s: %v", key, err)
return err
}
// If the CRR list is empty, end the sync
if len(crrList) == 0 {
return nil
}
// Pick one CRR
crr, err := c.pickRecreateRequest(crrList)
if err != nil || crr == nil {
return err
}
// ...
// Update the CRR's phase to Recreating
if crr.Status.Phase != appsv1alpha1.ContainerRecreateRequestRecreating {
return c.updateCRRPhase(crr, appsv1alpha1.ContainerRecreateRequestRecreating)
}
// Graceful restart is configured
if crr.Spec.Strategy.UnreadyGracePeriodSeconds != nil {
// Get the unready-acquired time from the annotation
unreadyTimeStr := crr.Annotations[appsv1alpha1.ContainerRecreateRequestUnreadyAcquiredKey]
if unreadyTimeStr == "" {
klog.Infof("CRR %s/%s is waiting for unready acquirement.", crr.Namespace, crr.Name)
return nil
}
// Parse the time
unreadyTime, err := time.Parse(time.RFC3339, unreadyTimeStr)
// If parsing fails, complete the CRR
if err != nil {
klog.Infof("CRR %s/%s failed to parse unready time %s: %v", crr.Namespace, crr.Name, unreadyTimeStr, err)
return c.completeCRRStatus(crr, fmt.Sprintf("failed to parse unready time %s: %v", unreadyTimeStr, err))
}
// Check whether the graceful-restart waiting period has elapsed
leftTime := time.Duration(*crr.Spec.Strategy.UnreadyGracePeriodSeconds)*time.Second - time.Since(unreadyTime)
// If not yet, keep waiting and requeue
if leftTime > 0 {
klog.Infof("CRR %s/%s is waiting for unready grace period %v left time.", crr.Namespace, crr.Name, leftTime)
c.queue.AddAfter(crr.Namespace+"/"+crr.Spec.PodName, leftTime+100*time.Millisecond)
return nil
}
}
// Proceed to manage
return c.manage(crr)
}
// The CRR picking process
func (c *Controller) pickRecreateRequest(crrList []*appsv1alpha1.ContainerRecreateRequest) (*appsv1alpha1.ContainerRecreateRequest, error) {
// Sort the CRR list:
// by creation time ascending, so earlier-created CRRs come first,
// and by phase, completed > recreating > pending
sort.Sort(crrListByPhaseAndCreated(crrList))
var picked *appsv1alpha1.ContainerRecreateRequest
// Iterate over the CRR list
for _, crr := range crrList {
// [-------------------- ResourceVersionExpectation ----------------]
// If the CRR is being deleted or has already completed
if crr.DeletionTimestamp != nil || crr.Status.CompletionTime != nil {
// Remove it from the resourceVersion expectation cache
resourceVersionExpectation.Delete(crr)
continue
}
// Make sure the cached resourceVersion is up to date
resourceVersionExpectation.Observe(crr)
// Check whether the expectation is satisfied, and for how long it has been waiting
if satisfied, duration := resourceVersionExpectation.IsSatisfied(crr); !satisfied {
if duration < maxExpectationWaitDuration {
break
}
// If it has been waiting for more than 10s, log a warning
klog.Warningf("Wait for CRR %s/%s resourceVersion expectation over %v", crr.Namespace, crr.Name, duration)
// Remove it from the expectation cache
resourceVersionExpectation.Delete(crr)
}
// [-------------------- ResourceVersionExpectation ----------------]
// This code shows that CRR processing is serialized:
// only one CRR is picked and returned at a time
if picked == nil {
picked = crr
// Set the other CRRs to Pending
} else if crr.Status.Phase == "" {
if err := c.updateCRRPhase(crr, appsv1alpha1.ContainerRecreateRequestPending); err != nil {
klog.Errorf("Failed to update CRR %s/%s status to Pending: %v", crr.Namespace, crr.Name, err)
return nil, err
}
}
}
return picked, nil
}
```
manage Flow
```go
func (c *Controller) manage(crr *appsv1alpha1.ContainerRecreateRequest) error {
// Create a runtimeManager to manage container lifecycles
runtimeManager, err := c.newRuntimeManager(c.runtimeFactory, crr)
// If creation fails, complete the CRR
if err != nil {
klog.Errorf("Failed to find runtime service for %s/%s: %v", crr.Namespace, crr.Name, err)
return c.completeCRRStatus(crr, fmt.Sprintf("failed to find runtime service: %v", err))
}
// Extract the CRR's information and build a Pod object from it
pod := convertCRRToPod(crr)
// Use the runtimeManager to query this Pod's status
podStatus, err := runtimeManager.GetPodStatus(pod.UID, pod.Name, pod.Namespace)
if err != nil {
return fmt.Errorf("failed to GetPodStatus %s/%s with uid %s: %v", pod.Namespace, pod.Name, pod.UID, err)
}
klog.V(5).Infof("CRR %s/%s for Pod %s GetPodStatus: %v", crr.Namespace, crr.Name, pod.Name, util.DumpJSON(podStatus))
// Compute each container's CRR recreate state
newCRRContainerRecreateStates := getCurrentCRRContainersRecreateStates(crr, podStatus)
// If the container recreate states have changed, patch them
if !reflect.DeepEqual(crr.Status.ContainerRecreateStates, newCRRContainerRecreateStates) {
return c.patchCRRContainerRecreateStates(crr, newCRRContainerRecreateStates)
}
var completedCount int
// Iterate over each container recreate state
for i := range newCRRContainerRecreateStates {
state := &newCRRContainerRecreateStates[i]
switch state.Phase {
// This container has been recreated successfully
case appsv1alpha1.ContainerRecreateRequestSucceeded:
// Increment the completed count
completedCount++
continue
// This container failed to be recreated
case appsv1alpha1.ContainerRecreateRequestFailed:
// Increment the completed count
completedCount++
// With the Ignore policy, the failure is ignored and the remaining containers are still processed
if crr.Spec.Strategy.FailurePolicy == appsv1alpha1.ContainerRecreateRequestFailurePolicyIgnore {
continue
}
// Otherwise a single failure ends this CRR
return c.completeCRRStatus(crr, "")
case appsv1alpha1.ContainerRecreateRequestPending, appsv1alpha1.ContainerRecreateRequestRecreating:
}
// The container is currently being recreated
if state.Phase == appsv1alpha1.ContainerRecreateRequestRecreating {
// With ordered recreation, the next container will not be restarted until the previous one has finished
if crr.Spec.Strategy.OrderedRecreate {
break
}
continue
}
// Find the container's status in podStatus
kubeContainerStatus := podStatus.FindContainerStatusByName(state.Name)
// Break if not found
if kubeContainerStatus == nil {
break
}
msg := fmt.Sprintf("Stopping container %s by ContainerRecreateRequest %s", state.Name, crr.Name)
// Kill the corresponding container.
// This shows that a CRR restarts containers serially: this loop runs in a single goroutine
// and kills the containers one by one.
// In other words, to restart containers concurrently you need to create multiple CRRs.
err := runtimeManager.KillContainer(pod, kubeContainerStatus.ID, state.Name, msg, nil)
if err != nil {
klog.Errorf("Failed to kill container %s in Pod %s/%s for CRR %s/%s: %v", state.Name, pod.Namespace, pod.Name, crr.Namespace, crr.Name, err)
// If the kill fails, set this container's state to Failed
state.Phase = appsv1alpha1.ContainerRecreateRequestFailed
state.Message = fmt.Sprintf("kill container error: %v", err)
// With the Ignore policy, continue with the remaining containers
if crr.Spec.Strategy.FailurePolicy == appsv1alpha1.ContainerRecreateRequestFailurePolicyIgnore {
continue
}
// Update the CRR status
return c.patchCRRContainerRecreateStates(crr, newCRRContainerRecreateStates)
}
// Kill succeeded: mark this container's phase as Recreating
state.Phase = appsv1alpha1.ContainerRecreateRequestRecreating
break
}
// Update the status
if !reflect.DeepEqual(crr.Status.ContainerRecreateStates, newCRRContainerRecreateStates) {
return c.patchCRRContainerRecreateStates(crr, newCRRContainerRecreateStates)
}
// If all containers have finished, complete the CRR
if completedCount == len(newCRRContainerRecreateStates) {
return c.completeCRRStatus(crr, "")
}
// If minStartedSeconds is set, requeue to check again later
if crr.Spec.Strategy != nil && crr.Spec.Strategy.MinStartedSeconds > 0 {
c.queue.AddAfter(objectKey(crr), time.Duration(crr.Spec.Strategy.MinStartedSeconds)*time.Second)
}
return nil
}
// Computing all container recreate states
func getCurrentCRRContainersRecreateStates(
crr *appsv1alpha1.ContainerRecreateRequest,
podStatus *kubeletcontainer.PodStatus,
) []appsv1alpha1.ContainerRecreateRequestContainerRecreateState {
var minStartedDuration time.Duration
// Read minStartedSeconds
if crr.Spec.Strategy != nil {
minStartedDuration = time.Duration(crr.Spec.Strategy.MinStartedSeconds) * time.Second
}
// Get the container statuses recorded on the CRR; they were built during the Reconcile sync
// and are read from an annotation, i.e. each container's state before the recreation
syncContainerStatuses := getCRRSyncContainerStatuses(crr)
var statuses []appsv1alpha1.ContainerRecreateRequestContainerRecreateState
// Iterate over the containers in the CRR spec
for i := range crr.Spec.Containers {
c := &crr.Spec.Containers[i]
// Get this container's previous recreate state from the CRR status
previousContainerRecreateState := getCRRContainerRecreateState(crr, c.Name)
// States that are already Failed or Succeeded are kept as-is and skipped
if previousContainerRecreateState != nil &&
(previousContainerRecreateState.Phase == appsv1alpha1.ContainerRecreateRequestFailed ||
previousContainerRecreateState.Phase == appsv1alpha1.ContainerRecreateRequestSucceeded) {
statuses = append(statuses, *previousContainerRecreateState)
continue
}
syncContainerStatus := syncContainerStatuses[c.Name]
kubeContainerStatus := podStatus.FindContainerStatusByName(c.Name)
var currentState appsv1alpha1.ContainerRecreateRequestContainerRecreateState
// The container cannot be found in podStatus
if kubeContainerStatus == nil {
currentState = appsv1alpha1.ContainerRecreateRequestContainerRecreateState{
Name: c.Name,
Phase: appsv1alpha1.ContainerRecreateRequestPending,
Message: "not found container on Node",
}
// The container status from podStatus is not Running
} else if kubeContainerStatus.State != kubeletcontainer.ContainerStateRunning {
// for no-running state, we consider it will be recreated or restarted soon
currentState = appsv1alpha1.ContainerRecreateRequestContainerRecreateState{
Name: c.Name,
Phase: appsv1alpha1.ContainerRecreateRequestRecreating,
}
// If the container ID in podStatus differs from the one recorded in the CRR,
// or podStatus shows a larger restart count than the CRR recorded,
// or podStatus shows a start time later than the CRR's creation time,
// then this container has already been recreated or restarted
} else if kubeContainerStatus.ID.String() != c.StatusContext.ContainerID ||
kubeContainerStatus.RestartCount > int(c.StatusContext.RestartCount) ||
kubeContainerStatus.StartedAt.After(crr.CreationTimestamp.Time) {
// already recreated or restarted
currentState = appsv1alpha1.ContainerRecreateRequestContainerRecreateState{
Name: c.Name,
Phase: appsv1alpha1.ContainerRecreateRequestRecreating,
}
// If the two container IDs are the same, the container has kept running longer than minStartedSeconds,
// and the container is ready, mark this container's recreation as succeeded
if syncContainerStatus != nil &&
syncContainerStatus.ContainerID == kubeContainerStatus.ID.String() &&
time.Since(kubeContainerStatus.StartedAt) > minStartedDuration &&
syncContainerStatus.Ready {
currentState.Phase = appsv1alpha1.ContainerRecreateRequestSucceeded
}
// Otherwise, mark the container as Pending
} else {
currentState = appsv1alpha1.ContainerRecreateRequestContainerRecreateState{
Name: c.Name,
Phase: appsv1alpha1.ContainerRecreateRequestPending,
}
}
statuses = append(statuses, currentState)
}
return statuses
}
```