Kubernetes Scheduler源码分析--启动过程与多队列缓存(续)

继续上文对Scheduler的分析，分析在Scheduler主循环处理过程中，podQueue,Queue和assumePod 三个队列的处理。Scheduler中SchedulerOne为主要的处理函数，其源代码为func (s *Scheduler) scheduleOne() {pod := s.config.NextPod()if pod.DeletionTim

程序源234

11959人浏览 · 2017-05-28 16:30:15

程序源234 · 2017-05-28 16:30:15 发布

继续上文对Scheduler的分析，分析在Scheduler主循环处理过程中，podQueue,Queue和assumePod 三个队列的处理。

Scheduler中scheduleOne为主要的处理函数，其源代码为

// scheduleOne runs one complete scheduling cycle for a single pod.
//
// It takes the next pod from s.config.NextPod, asks the configured algorithm
// for a destination node, records a copy of the pod in the scheduler cache as
// assumed on that node, and then issues the binding from a background
// goroutine so the caller is not blocked on the bind request.
//
// Failure handling:
//   - A pod that already has a DeletionTimestamp is skipped with a warning
//     event.
//   - If the algorithm returns an error, s.config.Error is invoked, a
//     "FailedScheduling" warning event is recorded and the PodScheduled
//     condition is set to false.
//   - If AssumePod fails, the cycle is abandoned without binding.
//   - If the binding fails, the assumed pod is forgotten from the cache and
//     the same error/event/condition reporting is performed.
func (s *Scheduler) scheduleOne() {
	pod := s.config.NextPod()
	if pod.DeletionTimestamp != nil {
		s.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		glog.V(3).Infof("Skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		return
	}

	glog.V(3).Infof("Attempting to schedule pod: %v/%v", pod.Namespace, pod.Name)
	start := time.Now()
	dest, err := s.config.Algorithm.Schedule(pod, s.config.NodeLister)
	if err != nil {
		glog.V(1).Infof("Failed to schedule pod: %v/%v", pod.Namespace, pod.Name)
		s.config.Error(pod, err)
		s.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "%v", err)
		s.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
			Type:    v1.PodScheduled,
			Status:  v1.ConditionFalse,
			Reason:  v1.PodReasonUnschedulable,
			Message: err.Error(),
		})
		return
	}
	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start))

	// Optimistically assume that the binding will succeed and send it to apiserver
	// in the background.
	// If the binding fails, scheduler will release resources allocated to assumed pod
	// immediately.
	assumed := *pod
	assumed.Spec.NodeName = dest
	if err := s.config.SchedulerCache.AssumePod(&assumed); err != nil {
		glog.Errorf("scheduler cache AssumePod failed: %v", err)
		// TODO: This means that a given pod is already in cache (which means it
		// is either assumed or already added). This is most probably result of a
		// BUG in retrying logic. As a temporary workaround (which doesn't fully
		// fix the problem, but should reduce its impact), we simply return here,
		// as binding doesn't make sense anyway.
		// This should be fixed properly though.
		return
	}

	go func() {
		// The observation is wrapped in a closure on purpose: arguments of a
		// deferred call are evaluated when the defer statement executes, so
		// deferring Observe(SinceInMicroseconds(start)) directly would sample
		// the elapsed time before the binding has even been attempted.
		defer func() {
			metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
		}()

		b := &v1.Binding{
			ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
			Target: v1.ObjectReference{
				Kind: "Node",
				Name: dest,
			},
		}

		bindingStart := time.Now()
		// If binding succeeded then PodScheduled condition will be updated in apiserver so that
		// it's atomic with setting host.
		bindErr := s.config.Binder.Bind(b)
		// FinishBinding is reported to the cache whether or not the bind call
		// succeeded; the bind result itself is handled right after.
		if err := s.config.SchedulerCache.FinishBinding(&assumed); err != nil {
			glog.Errorf("scheduler cache FinishBinding failed: %v", err)
		}
		if bindErr != nil {
			glog.V(1).Infof("Failed to bind pod: %v/%v", pod.Namespace, pod.Name)
			if err := s.config.SchedulerCache.ForgetPod(&assumed); err != nil {
				glog.Errorf("scheduler cache ForgetPod failed: %v", err)
			}
			s.config.Error(pod, bindErr)
			// A rejected binding is a failure, so it is reported as a warning,
			// matching the "FailedScheduling" event emitted when the
			// algorithm itself fails.
			s.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "Binding rejected: %v", bindErr)
			s.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
				Type:   v1.PodScheduled,
				Status: v1.ConditionFalse,
				Reason: "BindingRejected",
			})
			return
		}
		metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
		s.config.Recorder.Eventf(pod, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v to %v", pod.Name, dest)
	}()
}

其中主要的步骤包括：

(1) 从podQueue队列中，取出nextPod

(2) 利用调度算法进行调度

(3) 将调度的Pod信息以及对应的Node信息，写入到SchedulerCache.AssumePod中

(4) 进行Binding操作

其中异常流程的处理有：

(1)如果调度失败了，则执行Error函数(Error函数指针在初始化的时候赋值为 f.MakeDefaultErrorFunc(podBackoff, f.podQueue))，并且通过Recorder.Eventf记录事件，通过PodConditionUpdater更新Pod的状态

(2)如果binding失败，则执行SchedulerCache.ForgetPod(&assumed)，同时执行与异常流程(1)相同的处理(Error函数、Recorder.Eventf和PodConditionUpdater)

在Pod调度处理完成后，Pod的信息被标记在AssumePod中，这时有两条路径会对AssumePod中的Pod进行处理

路径1：在Queue队列中，通过监听已经调度成功的Pod信息对AssumePod中Pod信息进行刷新。相关的代码：

// processLoop repeatedly pops items from the controller's queue, passing
// c.config.Process as the pop-time processing function.
//
// The loop ends only when Pop returns FIFOClosedError. For any other error
// the popped item is put back with AddIfNotPresent, provided
// c.config.RetryOnError is set; otherwise the error is dropped and the loop
// continues with the next item.
func (c *controller) processLoop() {
	for {
		item, popErr := c.config.Queue.Pop(PopProcessFunc(c.config.Process))
		switch {
		case popErr == nil:
			// No error; go straight to the next item.
		case popErr == FIFOClosedError:
			return
		case c.config.RetryOnError:
			// This is the safe way to re-enqueue.
			c.config.Queue.AddIfNotPresent(item)
		}
	}
}

Process函数指针中处理函数为：

		// Process applies one queue item to clientState and notifies the
		// handler h. The item is type-asserted to Deltas (a different type
		// panics), and its entries are applied in order. The first error
		// returned by a store write stops processing and is returned to the
		// caller; on success nil is returned.
		Process: func(obj interface{}) error {
			// from oldest to newest
			for _, d := range obj.(Deltas) {
				switch d.Type {
				case Sync, Added, Updated:
					// An object already present in the store is handled as an
					// update; a failed lookup or a missing object is handled
					// as an add.
					if old, exists, err := clientState.Get(d.Object); err == nil && exists {
						if err := clientState.Update(d.Object); err != nil {
							return err
						}
						// Notify only after the store write succeeded.
						h.OnUpdate(old, d.Object)
					} else {
						if err := clientState.Add(d.Object); err != nil {
							return err
						}
						h.OnAdd(d.Object)
					}
				case Deleted:
					if err := clientState.Delete(d.Object); err != nil {
						return err
					}
					h.OnDelete(d.Object)
				}
			}
			return nil
		},

其中OnAdd，OnUpdate和OnDelete为函数指针参数，外层传入的参数为：

		// Event callbacks: the add, update and delete hooks are bound to the
		// corresponding pod-cache methods on c.
		cache.ResourceEventHandlerFuncs{
			AddFunc:    c.addPodToCache,
			UpdateFunc: c.updatePodInCache,
			DeleteFunc: c.deletePodFromCache,
		},

在ConfigFactory中对应的处理为(以Add为例)：

// addPodToCache is the add-event callback for pods. It forwards obj to the
// scheduler cache via AddPod when obj is a *v1.Pod; any other value is logged
// and ignored. A failure reported by AddPod is logged, not returned.
//
// TODO(harryz) need to update all the handlers here and below for equivalence cache
func (c *ConfigFactory) addPodToCache(obj interface{}) {
	switch pod := obj.(type) {
	case *v1.Pod:
		if addErr := c.schedulerCache.AddPod(pod); addErr != nil {
			glog.Errorf("scheduler cache AddPod failed: %v", addErr)
		}
	default:
		glog.Errorf("cannot convert to *v1.Pod: %v", obj)
	}
}

从而实现对assumePods的更新

路径2：在schedulerCache中cleanupExpiredAssumedPods，定时的轮询，清理过期的Pod

// cleanupAssumedPods walks the assumed-pod set under the cache lock and
// expires every pod whose deadline is before now. The time is taken as an
// argument so that tests can be deterministic.
//
// A pod whose binding has not finished is never expired here; it is logged
// and skipped. An assumed key with no entry in podStates causes a panic.
// Errors from expirePod are logged and do not stop the sweep.
func (cache *schedulerCache) cleanupAssumedPods(now time.Time) {
	cache.mu.Lock()
	defer cache.mu.Unlock()

	// The size of assumedPods should be small
	for podKey := range cache.assumedPods {
		state, found := cache.podStates[podKey]
		if !found {
			panic("Key found in assumed set but not in podStates. Potentially a logical error.")
		}

		switch {
		case !state.bindingFinished:
			glog.Warningf("Couldn't expire cache for pod %v/%v. Binding is still in progress.",
				state.pod.Namespace, state.pod.Name)
		case now.After(*state.deadline):
			glog.Warningf("Pod %s/%s expired", state.pod.Namespace, state.pod.Name)
			if expireErr := cache.expirePod(podKey, state); expireErr != nil {
				glog.Errorf("ExpirePod failed for %s: %v", podKey, expireErr)
			}
		}
	}
}

这个函数运行在一个独立的协程中，该协程在schedulerCache创建时启动

// New builds a scheduler cache from ttl, cleanAssumedPeriod and stop, calls
// run on it, and returns it as a Cache.
func New(ttl time.Duration, stop <-chan struct{}) Cache {
	sc := newSchedulerCache(ttl, cleanAssumedPeriod, stop)
	sc.run()
	return sc
}

// run starts a goroutine that hands cache.cleanupExpiredAssumedPods to
// wait.Until together with the cache's period and stop channel. It returns
// immediately and does not wait for that goroutine.
func (cache *schedulerCache) run() {
	go wait.Until(cache.cleanupExpiredAssumedPods, cache.period, cache.stop)
}

总结：

整体上Scheduler模块对于Pod的处理，是基于list-watch机制获取哪些Pod需要调度，然后将信息存入到podQueue缓存队列中，再在scheduleOne函数中将Pod信息取出，利用调度算法进行调度。

完成调度后，将Pod的信息存入SchedulerCache的assumePods队列中再次缓存

再通过list-watch机制监听已经被调度的Pod的信息，放入到Queue队列中，在Controller的协程中，同步的更新assumePods队列中的数据进行对账处理

CSDN学习社区

CSDN联合极客时间，共同打造面向开发者的精品内容学习社区，助力成长！

更多推荐

1 小时解读鸿蒙 10 大热点问题

CSDN学习社区

用 OpenAI Assistants 做大模型应用开发

CSDN学习社区

1 小时解读鸿蒙 10 大热点问题

CSDN学习社区

所有评论(0)

查看更多评论

程序源234

@yan234280533

已为社区贡献4条内容