Preface
When you type kubectl delete pods podname, how exactly does K8s kill your pod? This article walks through that process from the source code's point of view.
Analysis
When we run kubectl delete pods, kubectl calls the pod DELETE API. The APIServer does not remove the object immediately; it updates the pod's DeletionTimestamp and DeletionGracePeriodSeconds fields, and the Kubelet takes over from there.
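To make this concrete, here is a minimal client-go sketch — an illustration of what kubectl does under the hood, not code from kubectl itself; the namespace "default" and the pod name "podname" are placeholders — that issues a graceful DELETE and then reads back the deletion metadata the APIServer has stamped onto the pod:
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	// Ask for a graceful delete with a 30s grace period (the default for pods).
	gracePeriod := int64(30)
	if err := client.CoreV1().Pods("default").Delete(context.TODO(), "podname",
		metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod}); err != nil {
		panic(err)
	}

	// Right after the DELETE the pod object still exists: the APIServer has only
	// stamped the deletion metadata; the Kubelet does the real work.
	pod, err := client.CoreV1().Pods("default").Get(context.TODO(), "podname", metav1.GetOptions{})
	if err == nil && pod.DeletionTimestamp != nil {
		fmt.Println("DeletionTimestamp:", pod.DeletionTimestamp)
		fmt.Println("DeletionGracePeriodSeconds:", *pod.DeletionGracePeriodSeconds)
	}
}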
With that in mind, let's look at the concrete implementation from the Kubelet's side.
func (kl *Kubelet) syncLoopIteration(configCh <-chan kubetypes.PodUpdate, handler SyncHandler,
syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool {
select {
case u, open := <-configCh:
// Update from a config source; dispatch it to the right handler
// callback.
if !open {
klog.ErrorS(nil, "Update channel is closed, exiting the sync loop")
return false
}
switch u.Op {
case kubetypes.ADD:
klog.V(2).InfoS("SyncLoop ADD", "source", u.Source, "pods", format.Pods(u.Pods))
// After a restart, kubelet gets all existing pods through ADD as if they are new pods
handler.HandlePodAdditions(u.Pods)
case kubetypes.UPDATE:
klog.V(2).InfoS("SyncLoop UPDATE", "source", u.Source, "pods", format.Pods(u.Pods))
handler.HandlePodUpdates(u.Pods)
case kubetypes.REMOVE:
klog.V(2).InfoS("SyncLoop REMOVE", "source", u.Source, "pods", format.Pods(u.Pods))
handler.HandlePodRemoves(u.Pods)
case kubetypes.RECONCILE:
klog.V(4).InfoS("SyncLoop RECONCILE", "source", u.Source, "pods", format.Pods(u.Pods))
handler.HandlePodReconcile(u.Pods)
case kubetypes.DELETE:
klog.V(2).InfoS("SyncLoop DELETE", "source", u.Source, "pods", format.Pods(u.Pods))
// DELETE is treated as a UPDATE because of graceful deletion.
handler.HandlePodUpdates(u.Pods)
case kubetypes.SET:
// TODO: Do we want to support this?
klog.ErrorS(nil, "Kubelet does not support snapshot update")
default:
klog.ErrorS(nil, "Invalid operation type received", "operation", u.Op)
}
kl.sourcesReady.AddSource(u.Source)
}
return true
}
As we can see, syncLoopIteration dispatches UPDATE events to HandlePodUpdates — and because of graceful deletion, DELETE is treated as an UPDATE, so a pod deletion ends up in the same handler.
// HandlePodUpdates is the callback in the SyncHandler interface for pods
// being updated from a config source.
func (kl *Kubelet) HandlePodUpdates(pods []*v1.Pod) {
start := kl.clock.Now()
for _, pod := range pods {
kl.podManager.UpdatePod(pod)
if kubetypes.IsMirrorPod(pod) {
kl.handleMirrorPod(pod, start)
continue
}
mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod)
kl.dispatchWork(pod, kubetypes.SyncPodUpdate, mirrorPod, start)
}
}
// dispatchWork starts the asynchronous sync of the pod in a pod worker.
// If the pod has completed termination, dispatchWork will perform no action.
func (kl *Kubelet) dispatchWork(pod *v1.Pod, syncType kubetypes.SyncPodType, mirrorPod *v1.Pod, start time.Time) {
// check whether we are ready to delete the pod from the API server (all status up to date)
containersTerminal, podWorkerTerminal := kl.podAndContainersAreTerminal(pod)
if pod.DeletionTimestamp != nil && containersTerminal {
klog.V(4).InfoS("Pod has completed execution and should be deleted from the API server", "pod", klog.KObj(pod), "syncType", syncType)
kl.statusManager.TerminatePod(pod)
return
}
// optimization: avoid invoking the pod worker if no further changes are possible to the pod definition
// (i.e. the pod has completed and its containers have been terminated)
if podWorkerTerminal && containersTerminal {
klog.V(4).InfoS("Pod has completed and its containers have been terminated, ignoring remaining sync work", "pod", klog.KObj(pod), "syncType", syncType)
return
}
// Run the sync in an async worker.
kl.podWorkers.UpdatePod(&UpdatePodOptions{
Pod: pod,
MirrorPod: mirrorPod,
UpdateType: syncType,
OnCompleteFunc: func(err error) {
if err != nil {
metrics.PodWorkerDuration.WithLabelValues(syncType.String()).Observe(metrics.SinceInSeconds(start))
}
},
})
// Note the number of containers for new pods.
if syncType == kubetypes.SyncPodCreate {
metrics.ContainersPerPodCount.Observe(float64(len(pod.Spec.Containers)))
}
}
// Apply the new setting to the specified pod.
// If the options provide an OnCompleteFunc, the function is invoked if the update is accepted.
// Update requests are ignored if a kill pod request is pending.
func (p *podWorkers) UpdatePod(options *UpdatePodOptions) {
pod := options.Pod
uid := pod.UID
var podUpdates chan UpdatePodOptions
var exists bool
p.podLock.Lock()
defer p.podLock.Unlock()
if podUpdates, exists = p.podUpdates[uid]; !exists {
// A buffer of size 1 is enough: whenever the worker itself enqueues an
// update, the channel is guaranteed to be empty.
podUpdates = make(chan UpdatePodOptions, 1)
p.podUpdates[uid] = podUpdates
// Creating a new pod worker either means this is a new pod, or that the
// kubelet just restarted. In either case the kubelet is willing to believe
// the status of the pod for the first pod worker sync. See corresponding
// comment in syncPod.
go func() {
defer runtime.HandleCrash()
p.managePodLoop(podUpdates)
}()
}
if !p.isWorking[pod.UID] {
p.isWorking[pod.UID] = true
podUpdates <- *options
} else {
// if a request to kill a pod is pending, we do not let anything overwrite that request
update, found := p.lastUndeliveredWorkUpdate[pod.UID]
if !found || update.UpdateType != kubetypes.SyncPodKill {
p.lastUndeliveredWorkUpdate[pod.UID] = *options
}
}
}
func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) {
var lastSyncTime time.Time
for update := range podUpdates {
err := func() error {
// block until the pod cache has a status newer than the last sync
status, err := p.podCache.GetNewerThan(update.Pod.UID, lastSyncTime)
if err != nil {
return err
}
err = p.syncPodFn(syncPodOptions{
mirrorPod: update.MirrorPod,
pod: update.Pod,
podStatus: status,
killPodOptions: update.KillPodOptions,
updateType: update.UpdateType,
})
lastSyncTime = time.Now()
return err
}()
// notify the callback (e.g. the metrics hook set in dispatchWork) of the result
if update.OnCompleteFunc != nil {
update.OnCompleteFunc(err)
}
}
}
The syncPodFn invoked above is, in the end, the kubelet.syncPod method.
func (kl *Kubelet) syncPod(o syncPodOptions) error {
// pull out the required options (large parts of syncPod are elided here)
pod := o.pod
podStatus := o.podStatus
// Generate the final API pod status from the runtime status
apiPodStatus := kl.generateAPIPodStatus(pod, podStatus)
runnable := kl.canRunPod(pod)
// Update status in the status manager
kl.statusManager.SetPodStatus(pod, apiPodStatus)
// Kill pod if it should not be running
if !runnable.Admit || pod.DeletionTimestamp != nil || apiPodStatus.Phase == v1.PodFailed {
var syncErr error
if err := kl.killPod(pod, nil, podStatus, nil); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
syncErr = fmt.Errorf("error killing pod: %v", err)
utilruntime.HandleError(syncErr)
} else {
if !runnable.Admit {
// There was no error killing the pod, but the pod cannot be run.
// Return an error to signal that the sync loop should back off.
syncErr = fmt.Errorf("pod cannot be run: %s", runnable.Message)
}
}
return syncErr
}
return nil
}
From there, we can see the pod being handed to the kubelet.killPod method:
// One of the following arguments must be non-nil: runningPod, status.
func (kl *Kubelet) killPod(pod *v1.Pod, runningPod *kubecontainer.Pod, status *kubecontainer.PodStatus, gracePeriodOverride *int64) error {
// Derive the running pod from whichever argument was provided
var p kubecontainer.Pod
if runningPod != nil {
p = *runningPod
} else {
p = kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), status)
}
// Call the container runtime KillPod method which stops all running containers of the pod
if err := kl.containerRuntime.KillPod(pod, p, gracePeriodOverride); err != nil {
return err
}
return nil
}
The call chain then continues as follows:
// KillPod kills all the containers of a pod. Pod may be nil, running pod must not be.
// gracePeriodOverride if specified allows the caller to override the pod default grace period.
// only hard kill paths are allowed to specify a gracePeriodOverride in the kubelet in order to not corrupt user data.
// it is useful when doing SIGKILL for hard eviction scenarios, or max grace period during soft eviction scenarios.
func (m *kubeGenericRuntimeManager) KillPod(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) error {
err := m.killPodWithSyncResult(pod, runningPod, gracePeriodOverride)
return err.Error()
}
// killPodWithSyncResult kills a runningPod and returns SyncResult.
// Note: The pod passed in could be *nil* when kubelet restarted.
func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (result kubecontainer.PodSyncResult) {
killContainerResults := m.killContainersWithSyncResult(pod, runningPod, gracePeriodOverride)
for _, containerResult := range killContainerResults {
result.AddSyncResult(containerResult)
}
// stop sandbox, the sandbox will be removed in GarbageCollect
killSandboxResult := kubecontainer.NewSyncResult(kubecontainer.KillPodSandbox, runningPod.ID)
result.AddSyncResult(killSandboxResult)
// Stop all sandboxes belonging to the same pod
for _, podSandbox := range runningPod.Sandboxes {
if err := m.runtimeService.StopPodSandbox(podSandbox.ID.ID); err != nil {
killSandboxResult.Fail(kubecontainer.ErrKillPodSandbox, err.Error())
klog.ErrorS(nil, "Failed to stop sandbox", "podSandboxID", podSandbox.ID)
}
}
return
}
// killContainersWithSyncResult kills all pod's containers with sync results.
func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (syncResults []*kubecontainer.SyncResult) {
containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers))
wg := sync.WaitGroup{}
wg.Add(len(runningPod.Containers))
for _, container := range runningPod.Containers {
go func(container *kubecontainer.Container) {
defer utilruntime.HandleCrash()
defer wg.Done()
killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
if err := m.killContainer(pod, container.ID, container.Name, "", reasonUnknown, gracePeriodOverride); err != nil {
killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
klog.ErrorS(err, "Kill container failed", "pod", klog.KObj(pod), "podUID", pod.UID,
"containerName", container.Name, "containerID", container.ID)
}
containerResults <- killContainerResult
}(container)
}
wg.Wait()
close(containerResults)
for containerResult := range containerResults {
syncResults = append(syncResults, containerResult)
}
return
}
At this point we can see that the runtime manager issues a killContainer call for every container in the pod. Let's look at how that function is implemented:
// killContainer kills a container through the following steps:
// * Run the pre-stop lifecycle hooks (if applicable).
// * Stop the container.
func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, message string, reason containerKillReason, gracePeriodOverride *int64) error {
// From this point, pod and container must be non-nil (recovery of a nil spec is elided here).
containerSpec := kubecontainer.GetContainerSpec(pod, containerName)
gracePeriod := int64(minimumGracePeriodInSeconds)
switch {
case pod.DeletionGracePeriodSeconds != nil:
gracePeriod = *pod.DeletionGracePeriodSeconds
case pod.Spec.TerminationGracePeriodSeconds != nil:
gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
if utilfeature.DefaultFeatureGate.Enabled(features.ProbeTerminationGracePeriod) {
switch reason {
case reasonStartupProbe:
if containerSpec.StartupProbe != nil && containerSpec.StartupProbe.TerminationGracePeriodSeconds != nil {
gracePeriod = *containerSpec.StartupProbe.TerminationGracePeriodSeconds
}
case reasonLivenessProbe:
if containerSpec.LivenessProbe != nil && containerSpec.LivenessProbe.TerminationGracePeriodSeconds != nil {
gracePeriod = *containerSpec.LivenessProbe.TerminationGracePeriodSeconds
}
}
}
}
if len(message) == 0 {
message = fmt.Sprintf("Stopping container %s", containerSpec.Name)
}
m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeNormal, events.KillingContainer, message)
// Run internal pre-stop lifecycle hook
if err := m.internalLifecycle.PreStopContainer(containerID.ID); err != nil {
return err
}
// Run the pre-stop lifecycle hooks if applicable and if there is enough time to run it
if containerSpec.Lifecycle != nil && containerSpec.Lifecycle.PreStop != nil && gracePeriod > 0 {
gracePeriod = gracePeriod - m.executePreStopHook(pod, containerID, containerSpec, gracePeriod)
}
// always give containers a minimal shutdown window to avoid unnecessary SIGKILLs
if gracePeriod < minimumGracePeriodInSeconds {
gracePeriod = minimumGracePeriodInSeconds
}
err := m.runtimeService.StopContainer(containerID.ID, gracePeriod)
return err
}
This is the Kubelet's concrete container-stop logic. It first computes the effective gracePeriod; if the container defines a PreStop hook and there is budget left, the hook runs first; only then does it call m.runtimeService.StopContainer(containerID.ID, gracePeriod). The grace-period arithmetic is summarized in the sketch below, after which we will look at StopContainer itself.
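Here is a simplified, standalone restatement — the helper effectiveGracePeriod is a name of my own, for illustration only; the real logic lives inline in killContainer above — of the precedence rules: the pod object's DeletionGracePeriodSeconds wins over the spec's TerminationGracePeriodSeconds, a PreStop hook consumes part of the budget, and a minimum shutdown window is always preserved.
package main

import "fmt"

// the same floor the kubelet uses
const minimumGracePeriodInSeconds = 2

// effectiveGracePeriod is a hypothetical helper mirroring killContainer's logic.
func effectiveGracePeriod(deletionGPS, terminationGPS *int64, preStopSeconds int64) int64 {
	gracePeriod := int64(minimumGracePeriodInSeconds)
	switch {
	case deletionGPS != nil: // set by the APIServer on delete; takes precedence
		gracePeriod = *deletionGPS
	case terminationGPS != nil: // otherwise fall back to the pod spec
		gracePeriod = *terminationGPS
	}
	// the PreStop hook runs first and eats into the budget
	gracePeriod -= preStopSeconds
	// always give containers a minimal shutdown window to avoid unnecessary SIGKILLs
	if gracePeriod < minimumGracePeriodInSeconds {
		gracePeriod = minimumGracePeriodInSeconds
	}
	return gracePeriod
}

func main() {
	spec := int64(30)
	fmt.Println(effectiveGracePeriod(nil, &spec, 10)) // 20: 30s budget minus a 10s PreStop hook
	fmt.Println(effectiveGracePeriod(nil, &spec, 29)) // 2: floored at the minimum window
}
Note how a 30-second budget with a 29-second PreStop hook still leaves the 2-second minimum window. Now, the StopContainer method: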
func (in instrumentedRuntimeService) StopContainer(containerID string, timeout int64) error {
const operation = "stop_container"
defer recordOperation(operation, time.Now())
err := in.service.StopContainer(containerID, timeout)
recordError(operation, err)
return err
}
The call has now reached the runtime's StopContainer method. Let's look at how dockershim implements this part.
// StopContainer stops a running container with a grace period (i.e., timeout).
func (ds *dockerService) StopContainer(_ context.Context, r *runtimeapi.StopContainerRequest) (*runtimeapi.StopContainerResponse, error) {
err := ds.client.StopContainer(r.ContainerId, time.Duration(r.Timeout)*time.Second)
if err != nil {
return nil, err
}
return &runtimeapi.StopContainerResponse{}, nil
}
So with dockerd as the runtime, the Kubelet ultimately ends up calling the StopContainer interface of the Docker client — StopContainer is also one of the standard CRI interfaces.
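Since StopContainer is part of the CRI, nothing prevents us from calling it directly over gRPC — essentially what the kubelet's remote runtime service and crictl do. Here is a sketch; the socket path, the v1alpha2 API version, and the container ID are all assumptions that must match your node:
package main

import (
	"context"
	"time"

	"google.golang.org/grpc"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
)

func main() {
	// Dial the runtime's unix socket (containerd shown here; dockershim
	// listens on /var/run/dockershim.sock instead).
	conn, err := grpc.Dial("unix:///run/containerd/containerd.sock", grpc.WithInsecure())
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	client := runtimeapi.NewRuntimeServiceClient(conn)
	ctx, cancel := context.WithTimeout(context.Background(), 35*time.Second)
	defer cancel()

	// Timeout plays the role of the gracePeriod the kubelet computed above.
	_, err = client.StopContainer(ctx, &runtimeapi.StopContainerRequest{
		ContainerId: "<container-id>",
		Timeout:     30,
	})
	if err != nil {
		panic(err)
	}
}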
Next, let's follow the dockerd (moby) implementation to see the concrete stop steps.
// ContainerStop looks for the given container and stops it.
// In case the container fails to stop gracefully within a time duration
// specified by the timeout argument, in seconds, it is forcefully
// terminated (killed).
//
// If the timeout is nil, the container's StopTimeout value is used, if set,
// otherwise the engine default. A negative timeout value can be specified,
// meaning no timeout, i.e. no forceful termination is performed.
func (daemon *Daemon) ContainerStop(name string, timeout *int) error {
container, err := daemon.GetContainer(name)
if err != nil {
return err
}
if !container.IsRunning() {
return containerNotModifiedError{running: false}
}
if timeout == nil {
stopTimeout := container.StopTimeout()
timeout = &stopTimeout
}
if err := daemon.containerStop(container, *timeout); err != nil {
return errdefs.System(errors.Wrapf(err, "cannot stop container: %s", name))
}
return nil
}
// containerStop sends a stop signal, waits, sends a kill signal.
func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error {
// TODO propagate a context down to this function
ctx := context.TODO()
if !container.IsRunning() {
return nil
}
var wait time.Duration
if seconds >= 0 {
wait = time.Duration(seconds) * time.Second
}
success := func() error {
daemon.LogContainerEvent(container, "stop")
return nil
}
stopSignal := container.StopSignal()
// 1. Send a stop signal
err := daemon.killPossiblyDeadProcess(container, stopSignal)
if err != nil {
wait = 2 * time.Second
}
var subCtx context.Context
var cancel context.CancelFunc
if seconds >= 0 {
subCtx, cancel = context.WithTimeout(ctx, wait)
} else {
subCtx, cancel = context.WithCancel(ctx)
}
defer cancel()
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
// container did exit, so ignore any previous errors and return
return success()
}
if err != nil {
// the container has still not exited, and the kill function errored, so log the error here:
logrus.WithError(err).WithField("container", container.ID).Errorf("Error sending stop (signal %d) to container", stopSignal)
}
if seconds < 0 {
// if the client requested that we never kill / wait forever, but container.Wait was still
// interrupted (parent context cancelled, for example), we should propagate the signal failure
return err
}
logrus.WithField("container", container.ID).Infof("Container failed to exit within %s of signal %d - using the force", wait, stopSignal)
// Stop either failed or container didn't exit, so fallback to kill.
if err := daemon.Kill(container); err != nil {
// got a kill error, but give container 2 more seconds to exit just in case
subCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
defer cancel()
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
// container did exit, so ignore error and return
return success()
}
logrus.WithError(err).WithField("container", container.ID).Error("Error killing the container")
return err
}
return success()
}
These two functions show exactly how a container is stopped. dockerd first sends the container's stop signal — SIGTERM(15) unless the image overrides it — to the container's init process, then waits for the process to exit (in the normal case this wait is precisely the graceful-shutdown window). If the process exits in time, the stop succeeds; if not, dockerd sends SIGKILL(9) and terminates the process by force.
As for SIGTERM: if a program installs no handler for this signal, the kernel's default disposition applies, which terminates the process; if the program has registered its own handler, that handler runs instead and the program can shut down on its own terms. (SIGKILL, by contrast, can be neither caught nor ignored.)
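This is exactly why a long-running service in a container should install its own SIGTERM handler. A minimal sketch of the application side of graceful shutdown:
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	sigCh := make(chan os.Signal, 1)
	// Override the default disposition (terminate) so we get a chance to clean up.
	signal.Notify(sigCh, syscall.SIGTERM)

	fmt.Println("running; waiting for the runtime to send SIGTERM...")
	<-sigCh

	// Cleanup (drain connections, flush buffers, ...) must finish within the
	// pod's grace period, or the runtime follows up with SIGKILL.
	fmt.Println("got SIGTERM, shutting down cleanly")
}
One caveat: the signal is delivered to the container's init process, so if the entrypoint is a shell wrapper that does not forward signals, the application never sees the SIGTERM and is only killed when the grace period expires.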
Beyond the signal path, the kubelet also runs a statusManager that keeps monitoring the pod's status.
func (m *manager) Start() {
syncTicker := time.Tick(syncPeriod)
// syncPod and syncBatch share the same go routine to avoid sync races.
go wait.Forever(func() {
for {
select {
case syncRequest := <-m.podStatusChannel:
klog.V(5).InfoS("Status Manager: syncing pod with status from podStatusChannel",
"podUID", syncRequest.podUID,
"statusVersion", syncRequest.status.version,
"status", syncRequest.status.status)
m.syncPod(syncRequest.podUID, syncRequest.status)
case <-syncTicker:
klog.V(5).InfoS("Status Manager: syncing batch")
// remove any entries in the status channel since the batch will handle them
for i := len(m.podStatusChannel); i > 0; i-- {
<-m.podStatusChannel
}
m.syncBatch()
}
}
}, 0)
}
// syncPod syncs the given status with the API server. The caller must not hold the lock.
func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
// (the status-update logic is elided) fetch the pod this status belongs to
pod, err := m.kubeClient.CoreV1().Pods(status.podNamespace).Get(context.TODO(), status.podName, metav1.GetOptions{})
if err != nil {
return
}
// We don't handle graceful deletion of mirror pods.
if m.canBeDeleted(pod, status.status) {
deleteOptions := metav1.DeleteOptions{
GracePeriodSeconds: new(int64),
// Use the pod UID as the precondition for deletion to prevent deleting a
// newly created pod with the same name and namespace.
Preconditions: metav1.NewUIDPreconditions(string(pod.UID)),
}
err = m.kubeClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, deleteOptions)
if err != nil {
klog.InfoS("Failed to delete status for pod", "pod", klog.KObj(pod), "err", err)
return
}
klog.V(3).InfoS("Pod fully terminated and removed from etcd", "pod", klog.KObj(pod))
m.deletePodStatus(uid)
}
}
In this function, canBeDeleted decides whether the pod can finally be removed. If the conditions are met, the kubelet calls the DELETE API once more, this time with GracePeriodSeconds set to 0 — no longer graceful — and the APIServer deletes the pod resource from etcd immediately.
Let's look at canBeDeleted and the PodResourcesAreReclaimed method it calls internally:
func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus) bool {
if pod.DeletionTimestamp == nil || kubetypes.IsMirrorPod(pod) {
return false
}
return m.podDeletionSafety.PodResourcesAreReclaimed(pod, status)
}
// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
if !notRunning(status.ContainerStatuses) {
// We shouldn't delete pods that still have running containers
klog.V(3).InfoS("Pod is terminated, but some containers are still running", "pod", klog.KObj(pod))
return false
}
// pod's containers should be deleted
runtimeStatus, err := kl.podCache.Get(pod.UID)
if err != nil {
klog.V(3).InfoS("Pod is terminated, Error getting runtimeStatus from the podCache", "pod", klog.KObj(pod), "err", err)
return false
}
if len(runtimeStatus.ContainerStatuses) > 0 {
var statusStr string
for _, status := range runtimeStatus.ContainerStatuses {
statusStr += fmt.Sprintf("%+v ", *status)
}
klog.V(3).InfoS("Pod is terminated, but some containers have not been cleaned up", "pod", klog.KObj(pod), "status", statusStr)
return false
}
if kl.podVolumesExist(pod.UID) && !kl.keepTerminatedPodVolumes {
// We shouldn't delete pods whose volumes have not been cleaned up if we are not keeping terminated pod volumes
klog.V(3).InfoS("Pod is terminated, but some volumes have not been cleaned up", "pod", klog.KObj(pod))
return false
}
if kl.kubeletConfiguration.CgroupsPerQOS {
pcm := kl.containerManager.NewPodContainerManager()
if pcm.Exists(pod) {
klog.V(3).InfoS("Pod is terminated, but pod cgroup sandbox has not been cleaned up", "pod", klog.KObj(pod))
return false
}
}
return true
}
This function checks that all of the pod's node-level resources — running containers, container records, volumes, and the pod cgroup sandbox — have been reclaimed, i.e., that the pod satisfies the conditions for deletion.
So, putting it all together, the concrete flow of a graceful pod DELETE is: the client issues a graceful DELETE -> the APIServer updates the pod's DeletionTimestamp and DeletionGracePeriodSeconds -> the kubelet receives the update event, gracefully kills the pod, and reclaims its resources -> the kubelet requests an immediate delete (grace period 0) -> the APIServer removes the pod object from etcd -> the kubelet cleans up the pod's last remaining state.
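The whole sequence is easy to observe from the outside. The following sketch — namespace "default" and pod name "podname" are again placeholders — watches a pod while you delete it in another terminal: you will first see MODIFIED events with DeletionTimestamp set while the kubelet does its work, and finally a DELETED event once the statusManager's grace-period-0 delete removes the object from etcd.
package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	w, err := client.CoreV1().Pods("default").Watch(context.TODO(), metav1.ListOptions{
		FieldSelector: "metadata.name=podname",
	})
	if err != nil {
		panic(err)
	}
	defer w.Stop()

	for ev := range w.ResultChan() {
		pod, ok := ev.Object.(*corev1.Pod)
		if !ok {
			continue
		}
		fmt.Printf("%-8s phase=%-9s deletionTimestamp=%v\n", ev.Type, pod.Status.Phase, pod.DeletionTimestamp)
		if ev.Type == watch.Deleted {
			return // the pod object is gone from etcd
		}
	}
}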
Summary
I had always understood it as K8s itself sending SIGTERM and SIGKILL to delete a container, but this detailed walk-through shows that the party actually sending the signals is the CRI implementation. I was also not very familiar with Linux signal handling, so this was a good opportunity to revisit that material. And it confirmed once more that reading source code with a concrete question in mind is noticeably more efficient.
References
Pod 删除流程
Kubernetes源码分析之Pod的删除