0. Introduction
In the Kubernetes architecture, the controller manager is a never-ending control-loop component responsible for managing the state of cluster resources. It watches resource state through kube-apiserver, compares the current state with the desired state, and, when they differ, updates resources through kube-apiserver so that the current state converges to the desired state.
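To make this loop concrete, here is a minimal, generic sketch of such a control loop in Go. It is not kube-controller-manager code: observe and act are hypothetical stand-ins for reads and writes against kube-apiserver, and real controllers are event-driven through informers rather than polling on a timer.

package main

import (
    "context"
    "time"
)

// state is a simplified view of a resource: how many replicas exist vs. how many are wanted.
type state struct {
    desired int
    current int
}

// observe stands in for reading the current and desired state from kube-apiserver.
func observe(ctx context.Context) state { return state{desired: 1, current: 0} }

// act stands in for creating or deleting |diff| objects through kube-apiserver.
func act(ctx context.Context, diff int) {}

func reconcileLoop(ctx context.Context) {
    ticker := time.NewTicker(time.Second)
    defer ticker.Stop()
    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            s := observe(ctx)
            if s.current != s.desired {
                act(ctx, s.desired-s.current) // drive current state toward desired state
            }
        }
    }
}

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
    defer cancel()
    reconcileLoop(ctx)
}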
1. kube-controller-manager
The following analyzes how kube-controller-manager works from the source-code perspective.
kube-controller-manager uses Cobra as its command-line framework, and its initialization flow is similar to that of kube-scheduler and kube-apiserver.
Here is a brief example of the initialization code:
# kubernetes/cmd/kube-controller-manager/app/controllermanager.go
func NewControllerManagerCommand() *cobra.Command {
    // create the options
    s, err := options.NewKubeControllerManagerOptions()
    ...
    cmd := &cobra.Command{
        ...
        RunE: func(cmd *cobra.Command, args []string) error {
            ...
            // build the configuration from the options
            c, err := s.Config(KnownControllers(), ControllersDisabledByDefault(), ControllerAliases())
            if err != nil {
                return err
            }
            ...
            return Run(context.Background(), c.Complete())
        },
        ...
    }
    ...
}
Enter the Run function to see how kube-controller-manager runs.
# kubernetes/cmd/kube-controller-manager/app/controllermanager.go
func Run(ctx context.Context, c *config.CompletedConfig) error {
    ...
    run := func(ctx context.Context, controllerDescriptors map[string]*ControllerDescriptor) {
        // build the controller context
        controllerContext, err := CreateControllerContext(logger, c, rootClientBuilder, clientBuilder, ctx.Done())
        if err != nil {
            logger.Error(err, "Error building controller context")
            klog.FlushAndExit(klog.ExitFlushTimeout, 1)
        }
        // start the controllers; this is the main execution logic
        if err := StartControllers(ctx, controllerContext, controllerDescriptors, unsecuredMux, healthzHandler); err != nil {
            logger.Error(err, "Error starting controllers")
            klog.FlushAndExit(klog.ExitFlushTimeout, 1)
        }
        // start the informers
        controllerContext.InformerFactory.Start(stopCh)
        controllerContext.ObjectOrMetadataInformerFactory.Start(stopCh)
        close(controllerContext.InformersStarted)
        <-ctx.Done()
    }
    // No leader election, run directly
    if !c.ComponentConfig.Generic.LeaderElection.LeaderElect {
        // build the controller descriptors
        controllerDescriptors := NewControllerDescriptors()
        controllerDescriptors[names.ServiceAccountTokenController] = saTokenControllerDescriptor
        run(ctx, controllerDescriptors)
        return nil
    }
    ...
}
Like kube-scheduler, kube-controller-manager is typically deployed with multiple replicas but only one active instance, so it relies on leader election and only the elected leader runs the controllers. This is not covered in depth here; for details, see the Kubernetes leader election source code analysis.
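For context, below is a minimal sketch of how a component can run as a single active leader using client-go's leader-election helpers. It assumes a Lease lock in kube-system with a made-up name; the real kube-controller-manager builds its lock from its LeaderElection configuration (the --leader-elect flags), but the client-go API involved is the one shown here.

package main

import (
    "context"
    "os"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/rest"
    "k8s.io/client-go/tools/leaderelection"
    "k8s.io/client-go/tools/leaderelection/resourcelock"
)

func runWithLeaderElection(ctx context.Context, cfg *rest.Config, run func(ctx context.Context)) {
    client := kubernetes.NewForConfigOrDie(cfg)
    hostname, _ := os.Hostname()

    // The lock is a Lease object; whoever holds it is the active instance.
    lock := &resourcelock.LeaseLock{
        LeaseMeta:  metav1.ObjectMeta{Namespace: "kube-system", Name: "demo-controller-manager"},
        Client:     client.CoordinationV1(),
        LockConfig: resourcelock.ResourceLockConfig{Identity: hostname},
    }

    leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
        Lock:          lock,
        LeaseDuration: 15 * time.Second,
        RenewDeadline: 10 * time.Second,
        RetryPeriod:   2 * time.Second,
        Callbacks: leaderelection.LeaderCallbacks{
            OnStartedLeading: run,                   // start the controllers only after winning the election
            OnStoppedLeading: func() { os.Exit(1) }, // lost the lease: stop doing work
        },
    })
}

func main() {
    cfg, err := rest.InClusterConfig()
    if err != nil {
        panic(err)
    }
    runWithLeaderElection(context.Background(), cfg, func(ctx context.Context) {
        // controller loops would run here
        <-ctx.Done()
    })
}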
Now run the controller manager. First, NewControllerDescriptors registers a descriptor for each resource controller.
# kubernetes/cmd/kube-controller-manager/app/controllermanager.go
func NewControllerDescriptors() map[string]*ControllerDescriptor {
    register := func(controllerDesc *ControllerDescriptor) {
        ...
        controllers[name] = controllerDesc
    }
    ...
    // the register function registers each resource controller
    register(newEndpointsControllerDescriptor())
    register(newEndpointSliceControllerDescriptor())
    register(newEndpointSliceMirroringControllerDescriptor())
    register(newReplicationControllerDescriptor())
    register(newPodGarbageCollectorControllerDescriptor())
    register(newResourceQuotaControllerDescriptor())
    ...
    return controllers
}

# kubernetes/cmd/kube-controller-manager/app/apps.go
func newReplicaSetControllerDescriptor() *ControllerDescriptor {
    return &ControllerDescriptor{
        name:     names.ReplicaSetController,
        aliases:  []string{"replicaset"},
        initFunc: startReplicaSetController,
    }
}
Each resource controller descriptor contains an initFunc, mapping the controller to the function that starts it.
Within run, StartControllers starts the controllers.
# kubernetes/cmd/kube-controller-manager/app/controllermanager.go
func StartControllers(ctx context.Context, controllerCtx ControllerContext, controllerDescriptors map[string]*ControllerDescriptor,
    unsecuredMux *mux.PathRecorderMux, healthzHandler *controllerhealthz.MutableHealthzHandler) error {
    ...
    // iterate over the resource controller descriptors
    for _, controllerDesc := range controllerDescriptors {
        if controllerDesc.RequiresSpecialHandling() {
            continue
        }
        // start the resource controller
        check, err := StartController(ctx, controllerCtx, controllerDesc, unsecuredMux)
        if err != nil {
            return err
        }
        if check != nil {
            // HealthChecker should be present when controller has started
            controllerChecks = append(controllerChecks, check)
        }
    }
    ...
    return nil
}

func StartController(ctx context.Context, controllerCtx ControllerContext, controllerDescriptor *ControllerDescriptor,
    unsecuredMux *mux.PathRecorderMux) (healthz.HealthChecker, error) {
    ...
    // get the descriptor's start function
    initFunc := controllerDescriptor.GetInitFunc()
    // start the resource controller
    ctrl, started, err := initFunc(klog.NewContext(ctx, klog.LoggerWithName(logger, controllerName)), controllerCtx, controllerName)
    if err != nil {
        logger.Error(err, "Error starting controller", "controller", controllerName)
        return nil, err
    }
    ...
}
Kubernetes has many controllers. Here the ReplicaSet controller is used as an example to show how a controller runs.
Enter the ReplicaSet controller's initFunc, which starts the controller.
# kubernetes/cmd/kube-controller-manager/app/apps.go
func startReplicaSetController(ctx context.Context, controllerContext ControllerContext, controllerName string) (controller.Interface, bool, error) {
    go replicaset.NewReplicaSetController(
        klog.FromContext(ctx),
        controllerContext.InformerFactory.Apps().V1().ReplicaSets(),
        controllerContext.InformerFactory.Core().V1().Pods(),
        controllerContext.ClientBuilder.ClientOrDie("replicaset-controller"),
        replicaset.BurstReplicas,
    ).Run(ctx, int(controllerContext.ComponentConfig.ReplicaSetController.ConcurrentRSSyncs))
    return nil, true, nil
}
Running initFunc actually executes startReplicaSetController. startReplicaSetController starts a goroutine that runs replicaset.NewReplicaSetController and ReplicaSetController.Run: replicaset.NewReplicaSetController registers the informer event handlers, and ReplicaSetController.Run processes the resources that those event handlers add to the work queue.
First, enter replicaset.NewReplicaSetController to see what the function does.
# kubernetes/pkg/controller/replicaset/replica_set.go
func NewReplicaSetController(logger klog.Logger, rsInformer appsinformers.ReplicaSetInformer, podInformer coreinformers.PodInformer, kubeClient clientset.Interface, burstReplicas int) *ReplicaSetController {
    ...
    return NewBaseController(logger, rsInformer, podInformer, kubeClient, burstReplicas,
        apps.SchemeGroupVersion.WithKind("ReplicaSet"),
        "replicaset_controller",
        "replicaset",
        controller.RealPodControl{
            KubeClient: kubeClient,
            Recorder:   eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "replicaset-controller"}),
        },
        eventBroadcaster,
    )
}

func NewBaseController(logger klog.Logger, rsInformer appsinformers.ReplicaSetInformer, podInformer coreinformers.PodInformer, kubeClient clientset.Interface, burstReplicas int,
    gvk schema.GroupVersionKind, metricOwnerName, queueName string, podControl controller.PodControlInterface, eventBroadcaster record.EventBroadcaster) *ReplicaSetController {
    rsc := &ReplicaSetController{
        GroupVersionKind: gvk,
        kubeClient:       kubeClient,
        podControl:       podControl,
        eventBroadcaster: eventBroadcaster,
        burstReplicas:    burstReplicas,
        expectations:     controller.NewUIDTrackingControllerExpectations(controller.NewControllerExpectations()),
        queue:            workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), queueName),
    }
    rsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        AddFunc: func(obj interface{}) {
            rsc.addRS(logger, obj)
        },
        UpdateFunc: func(oldObj, newObj interface{}) {
            rsc.updateRS(logger, oldObj, newObj)
        },
        DeleteFunc: func(obj interface{}) {
            rsc.deleteRS(logger, obj)
        },
    })
    ...
    podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        AddFunc: func(obj interface{}) {
            rsc.addPod(logger, obj)
        },
        UpdateFunc: func(oldObj, newObj interface{}) {
            rsc.updatePod(logger, oldObj, newObj)
        },
        DeleteFunc: func(obj interface{}) {
            rsc.deletePod(logger, obj)
        },
    })
    ...
    rsc.syncHandler = rsc.syncReplicaSet
    return rsc
}
The function constructs the ReplicaSetController and registers event handlers on rsInformer and podInformer, which watch ReplicaSet and Pod changes from kube-apiserver and trigger the corresponding event handler for each kind of change.
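The same informer + workqueue pattern can be shown in a standalone sketch (assumed names, not kube-controller-manager code): the event handlers only convert objects into "namespace/name" keys and enqueue them, and a worker pops keys and runs the sync logic, just as the ReplicaSet controller does in Run below.

package main

import (
    "context"
    "fmt"
    "time"

    "k8s.io/client-go/informers"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/rest"
    "k8s.io/client-go/tools/cache"
    "k8s.io/client-go/util/workqueue"
)

func runPodWatcher(ctx context.Context, client kubernetes.Interface) error {
    factory := informers.NewSharedInformerFactory(client, 30*time.Second)
    queue := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "demo")

    podInformer := factory.Core().V1().Pods().Informer()
    podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
        AddFunc: func(obj interface{}) {
            if key, err := cache.MetaNamespaceKeyFunc(obj); err == nil {
                queue.Add(key) // enqueue "namespace/name"
            }
        },
        DeleteFunc: func(obj interface{}) {
            if key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj); err == nil {
                queue.Add(key)
            }
        },
    })

    factory.Start(ctx.Done())
    if !cache.WaitForCacheSync(ctx.Done(), podInformer.HasSynced) {
        return fmt.Errorf("failed to sync informer cache")
    }

    go func() {
        <-ctx.Done()
        queue.ShutDown() // unblocks queue.Get when the context is cancelled
    }()

    // the worker loop: pop keys until the queue is shut down
    for {
        key, quit := queue.Get()
        if quit {
            return nil
        }
        fmt.Println("sync", key) // a real controller calls its syncHandler here
        queue.Done(key)
    }
}

func main() {
    cfg, err := rest.InClusterConfig()
    if err != nil {
        panic(err)
    }
    if err := runPodWatcher(context.Background(), kubernetes.NewForConfigOrDie(cfg)); err != nil {
        panic(err)
    }
}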
Next, enter Run to see what the function does.
# kubernetes/pkg/controller/replicaset/replica_set.go
func (rsc *ReplicaSetController) Run(ctx context.Context, workers int) {
    ...
    // wait for the caches to be synced with the resources fetched from kube-apiserver
    if !cache.WaitForNamedCacheSync(rsc.Kind, ctx.Done(), rsc.podListerSynced, rsc.rsListerSynced) {
        return
    }
    for i := 0; i < workers; i++ {
        // workers process the items in the queue
        go wait.UntilWithContext(ctx, rsc.worker, time.Second)
    }
    <-ctx.Done()
}

func (rsc *ReplicaSetController) worker(ctx context.Context) {
    // the worker never stops
    for rsc.processNextWorkItem(ctx) {
    }
}

func (rsc *ReplicaSetController) processNextWorkItem(ctx context.Context) bool {
    // pop an item from the queue
    key, quit := rsc.queue.Get()
    if quit {
        return false
    }
    defer rsc.queue.Done(key)
    // process the item
    err := rsc.syncHandler(ctx, key.(string))
    if err == nil {
        rsc.queue.Forget(key)
        return true
    }
    ...
    return true
}
As shown above, rsc.syncHandler processes the items in the queue, and rsc.syncHandler is actually ReplicaSetController.syncReplicaSet.
With the code structure clear, let's walk through a delete-Pod example to see how kube-controller-manager works in practice.
1.1 Example: deleting a Pod
1.1.1 Example setup
Create a ReplicaSet as follows:
# helm list
NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
test default 1 2024-02-29 16:24:43.896757193 +0800 CST deployed test-0.1.0 1.16.0
# kubectl get replicaset
NAME DESIRED CURRENT READY AGE
test-6d47479b6b 1 1 1 10d
# kubectl get pods
NAME READY STATUS RESTARTS AGE
test-6d47479b6b-5k6cb 1/1 Running 0 9d
Delete the pod and observe how kube-controller-manager reacts.
1.1.2 Execution flow
Delete the pod:
# kubectl delete pods test-6d47479b6b-5k6cb
After the pod is deleted, the podInformer's event handler receives the pod change and calls the ReplicaSetController.deletePod function:
func (rsc *ReplicaSetController) deletePod(logger klog.Logger, obj interface{}) {
    pod, ok := obj.(*v1.Pod)
    ...
    logger.V(4).Info("Pod deleted", "delete_by", utilruntime.GetCaller(), "deletion_timestamp", pod.DeletionTimestamp, "pod", klog.KObj(pod))
    ...
    rsc.queue.Add(rsKey)
}
ReplicaSetController.deletePod enqueues the key of the ReplicaSet that owned the deleted pod. Then ReplicaSetController.processNextWorkItem, running in a worker, pops the key from the queue and hands it to ReplicaSetController.syncReplicaSet for processing.
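As a hypothetical illustration of what that queue item looks like in this example (assuming the ReplicaSet lives in the default namespace), the key is simply the ReplicaSet's "namespace/name" string, which syncReplicaSet later splits back apart:

package main

import (
    "fmt"

    "k8s.io/client-go/tools/cache"
)

func main() {
    key := "default/test-6d47479b6b" // assumed key for the ReplicaSet in this example
    namespace, name, _ := cache.SplitMetaNamespaceKey(key)
    fmt.Println(namespace, name) // default test-6d47479b6b
}

syncReplicaSet then resolves this key to the ReplicaSet object and re-counts its pods: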
func (rsc *ReplicaSetController) syncReplicaSet(ctx context.Context, key string) error {
    ...
    namespace, name, err := cache.SplitMetaNamespaceKey(key)
    ...
    // get the ReplicaSet that owned the deleted pod
    rs, err := rsc.rsLister.ReplicaSets(namespace).Get(name)
    ...
    // list all pods
    allPods, err := rsc.podLister.Pods(rs.Namespace).List(labels.Everything())
    if err != nil {
        return err
    }
    // Ignore inactive pods.
    filteredPods := controller.FilterActivePods(logger, allPods)
    // keep only the pods owned by this ReplicaSet;
    // the pod has just been deleted, so filteredPods is empty
    filteredPods, err = rsc.claimPods(ctx, rs, selector, filteredPods)
    if err != nil {
        return err
    }
    // a pod of the ReplicaSet was deleted,
    // so enter rsc.manageReplicas
    var manageReplicasErr error
    if rsNeedsSync && rs.DeletionTimestamp == nil {
        manageReplicasErr = rsc.manageReplicas(ctx, filteredPods, rs)
    }
    ...
}
Continue into ReplicaSetController.manageReplicas:
func (rsc *ReplicaSetController) manageReplicas(ctx context.Context, filteredPods []*v1.Pod, rs *apps.ReplicaSet) error {
    diff := len(filteredPods) - int(*(rs.Spec.Replicas))
    ...
    if diff < 0 {
        logger.V(2).Info("Too few replicas", "replicaSet", klog.KObj(rs), "need", *(rs.Spec.Replicas), "creating", diff)
        ...
        successfulCreations, err := slowStartBatch(diff, controller.SlowStartInitialBatchSize, func() error {
            err := rsc.podControl.CreatePods(ctx, rs.Namespace, &rs.Spec.Template, rs, metav1.NewControllerRef(rs, rsc.GroupVersionKind))
            if err != nil {
                if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
                    // if the namespace is being terminated, we don't have to do
                    // anything because any creation will fail
                    return nil
                }
            }
            return err
        })
        ...
    }
    ...
}
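The arithmetic behind this branch is worth spelling out for the scenario at hand. The values below are assumptions matching the example (the only pod was just deleted and spec.replicas is 1), and podsToCreate is an illustrative helper rather than a function in the Kubernetes code base:

package main

import "fmt"

// podsToCreate mirrors the diff computation above: a negative diff means
// the ReplicaSet is missing that many pods.
func podsToCreate(activePods, desiredReplicas int) int {
    diff := activePods - desiredReplicas
    if diff < 0 {
        return -diff
    }
    return 0
}

func main() {
    // 0 active pods, spec.replicas = 1, so manageReplicas creates 1 replacement pod.
    fmt.Println(podsToCreate(0, 1)) // 1
}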
When the number of filteredPods is smaller than the Replicas defined in the ReplicaSet's spec, rsc.podControl.CreatePods is called to create pods:
func (r RealPodControl) CreatePods(ctx context.Context, namespace string, template *v1.PodTemplateSpec, controllerObject runtime.Object, controllerRef *metav1.OwnerReference) error {
    return r.CreatePodsWithGenerateName(ctx, namespace, template, controllerObject, controllerRef, "")
}

func (r RealPodControl) CreatePodsWithGenerateName(ctx context.Context, namespace string, template *v1.PodTemplateSpec, controllerObject runtime.Object, controllerRef *metav1.OwnerReference, generateName string) error {
    ...
    return r.createPods(ctx, namespace, pod, controllerObject)
}

func (r RealPodControl) createPods(ctx context.Context, namespace string, pod *v1.Pod, object runtime.Object) error {
    ...
    newPod, err := r.KubeClient.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{})
    ...
    logger.V(4).Info("Controller created pod", "controller", accessor.GetName(), "pod", klog.KObj(newPod))
    ...
    return nil
}
Next, back in ReplicaSetController.syncReplicaSet:
func (rsc *ReplicaSetController) syncReplicaSet(ctx context.Context, key string) error {
    ...
    newStatus := calculateStatus(rs, filteredPods, manageReplicasErr)
    updatedRS, err := updateReplicaSetStatus(logger, rsc.kubeClient.AppsV1().ReplicaSets(rs.Namespace), rs, newStatus)
    if err != nil {
        return err
    }
    ...
}
Even though a replacement pod has just been created, filteredPods here is still 0, so updateReplicaSetStatus updates the ReplicaSet's current status to 0 replicas.
Updating the ReplicaSet's status triggers the ReplicaSet's event handler again, which re-enters ReplicaSetController.syncReplicaSet. By then, if the replacement pod has been created, filteredPods will include it, and updateReplicaSetStatus updates the ReplicaSet's current status to match the desired state.
2. Summary
This article introduced the execution flow of kube-controller-manager and, starting from a delete-pod example, showed how kube-controller-manager controls resource state.