__schedule() is the core function of the scheduler: it picks the most suitable process and switches the CPU over to it. This article focuses on analyzing that function.

Table of Contents

        • 1. The entry point of process scheduling: schedule()
        • 2. The workhorse of process scheduling: __schedule()
          • (2-1) The "eyes" of process scheduling: pick_next_task()
          • (2-2) The essence of process scheduling: context_switch()
            • (2-2-1) Paving the way for the switch: switch_mm()
            • (2-2-2) The executor of the switch: switch_to()
        • 3. Function call relationship diagram

1. The entry point of process scheduling: schedule()

From the file ./linux/kernel/sched/core.c.
In Linux, the commonly used entry point into the scheduler is schedule(), defined as follows:


static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
	/*
	 * If the task is about to sleep and has plugged block I/O queued,
	 * submit it now to avoid deadlocking on its own requests.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
}

asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	do {
		__schedule();
	} while (need_resched());
}
2. The workhorse of process scheduling: __schedule()

__schedule() is the core function of the Linux kernel scheduler; its implementation is shown below:

static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_note_context_switch();
	prev = rq->curr;

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	smp_mb__before_spinlock();
	raw_spin_lock_irq(&rq->lock);

	rq->clock_skip_update <<= 1; /* promote REQ to ACT */

	switch_count = &prev->nivcsw;
	/*
	 * prev blocked voluntarily (state != TASK_RUNNING) and this is not a
	 * preemption that hit while it was going to sleep: dequeue it.
	 */
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev))) {
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, DEQUEUE_SLEEP);
			prev->on_rq = 0;

			if (prev->flags & PF_WQ_WORKER) {
				struct task_struct *to_wakeup;

				to_wakeup = wq_worker_sleeping(prev, cpu);
				if (to_wakeup)
					try_to_wake_up_local(to_wakeup);
			}
		}
		switch_count = &prev->nvcsw;
	}

	if (task_on_rq_queued(prev))
		update_rq_clock(rq);

	/* ask the scheduling classes for the highest-priority runnable task */
	next = pick_next_task(rq, prev);
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();
	rq->clock_skip_update = 0;

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		rq = context_switch(rq, prev, next); /* unlocks the rq */
		cpu = cpu_of(rq);
	} else
		raw_spin_unlock_irq(&rq->lock);

	post_schedule(rq);

	sched_preempt_enable_no_resched();
}

In the code above, pick_next_task() asks the scheduler to pick the most suitable next process from the run queue, and context_switch() is then called to switch execution to that process.
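
The switch_count bookkeeping distinguishes voluntary from involuntary switches: a task that marked itself as sleeping before calling schedule() is counted in nvcsw, while a preempted task is counted in nivcsw. The caller-side idiom that produces a voluntary switch looks roughly like the sketch below (illustrative only; condition stands for whatever event the task is waiting on):

set_current_state(TASK_INTERRUPTIBLE);		/* prev->state becomes non-zero */
while (!condition) {
	schedule();				/* __schedule() dequeues this task */
	set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);		/* woken up, the event has occurred */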

(2-1) The "eyes" of process scheduling: pick_next_task()

From /kernel/sched/core.c:

static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class = &fair_sched_class;
	struct task_struct *p;
	
	/* if all runnable tasks are in the fair class, call it directly */
	if (likely(prev->sched_class == class &&
		   rq->nr_running == rq->cfs.h_nr_running)) {
		p = fair_sched_class.pick_next_task(rq, prev);
		if (unlikely(p == RETRY_TASK))
			goto again;

		if (unlikely(!p))
			p = idle_sched_class.pick_next_task(rq, prev);

		return p;
	}

again:
	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p) {
			if (unlikely(p == RETRY_TASK))
				goto again;
			return p;
		}
	}
    
	BUG(); /* the idle scheduling class must always have a runnable task */
}

The first if block in the function above is an optimization: if the current process prev belongs to the CFS scheduling class and the total number of runnable tasks on this CPU's run queue equals the number of tasks on its CFS run queue, then only normal (CFS) tasks are queued on this CPU and no other scheduling class has runnable tasks, so fair_sched_class.pick_next_task() can be called directly.

Otherwise, the whole chain of scheduling classes has to be traversed.

Expanding for_each_class(class) with the following macros:

#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)

we get:

 for (class = &stop_sched_class; class; class = class->next){
		p = class->pick_next_task(rq, prev);
		if (p) {
			if (unlikely(p == RETRY_TASK))
				goto again;
			return p;
		}
	}
    

The loop above walks the chain of five scheduling classes, linked together through their next pointers, and calls each class's pick_next_task() to choose the next process to run. During the traversal the classes are visited in priority order: stop_sched_class, dl_sched_class, rt_sched_class, fair_sched_class and finally idle_sched_class.

The stop_sched_class is used to stop a CPU; dl_sched_class and rt_sched_class are the scheduling classes for deadline and real-time processes, so whenever real-time processes exist in the system they are picked before normal processes.
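
For reference, the chain walked by for_each_class() is built from the .next field of each sched_class and, from highest to lowest priority, looks roughly like this (the exact linkage can differ between kernel versions and configurations):

	stop_sched_class -> dl_sched_class -> rt_sched_class -> fair_sched_class -> idle_sched_class -> NULL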

pick_next_task is a function pointer in struct sched_class with the following prototype:

	struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev);

Each of the five scheduling classes provides its own pick_next_task implementation, which selects the next process that class wants to run and returns a pointer to its task_struct.

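As an example, the CFS class wires its callbacks into the sched_class operations table in kernel/sched/fair.c roughly as sketched below; only a few fields are shown and the exact set varies between kernel versions:

const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,	/* next lower-priority class */
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.pick_next_task		= pick_next_task_fair,	/* used by pick_next_task() above */
	.task_tick		= task_tick_fair,
	/* ... many more callbacks ... */
};
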
(2-2) The essence of process scheduling: context_switch()

From /kernel/sched/core.c:

static inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{

	struct mm_struct *mm, *oldmm;
	
	/* prepare for the task switch */
	prepare_task_switch(rq, prev, next);

	mm = next->mm;
	oldmm = prev->active_mm; 

	arch_start_context_switch(prev);

	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}

	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

	context_tracking_task_switch(prev, next);
	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);
	barrier();

	return finish_task_switch(prev);
}

In the if (!mm) branch above, a NULL next->mm means that next is a kernel thread; for a normal user process, switch_mm() is called to handle the switch of the process address space.
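
The criterion is simply whether the task owns a user address space; a minimal illustration (the helper below is hypothetical, not an existing kernel API):

/* Hypothetical helper, for illustration only: a kernel thread owns no user
 * address space, so tsk->mm is NULL and it runs on a borrowed active_mm. */
static inline bool task_is_kernel_thread(const struct task_struct *tsk)
{
	return tsk->mm == NULL;
}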

(2-2-1) Paving the way for the switch: switch_mm()

static inline void
switch_mm(struct mm_struct *prev, struct mm_struct *next,
	  struct task_struct *tsk)
{
#ifdef CONFIG_MMU
	unsigned int cpu = smp_processor_id();
	if (cache_ops_need_broadcast() &&
	    !cpumask_empty(mm_cpumask(next)) &&
	    !cpumask_test_cpu(cpu, mm_cpumask(next)))
		__flush_icache_all();

	if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next) {
		check_and_switch_context(next, tsk);
		if (cache_is_vivt())
			cpumask_clear_cpu(cpu, mm_cpumask(prev));
	}
#endif
}

switch_mm() records the current CPU in the next process's cpumask bitmap and then calls check_and_switch_context() to carry out the ARM architecture-specific hardware setup. check_and_switch_context() is itself fairly long and involves a lot of hardware-specific detail, so we stop here for now.

(2-2-2) The executor of the switch: switch_to()

switch_to() is the actual switch point between the old and new processes: it switches execution from prev to next. Once it completes, the CPU is running next and prev has been switched out; colloquially, it has gone to "sleep".

switch_to() is architecture-specific as well and is largely written in assembly, so we stop here and leave it for a later analysis.
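
For reference, on ARM switch_to() is a macro that hands control to the assembly routine __switch_to, roughly as sketched below (simplified from the general shape of arch/arm/include/asm/switch_to.h; details differ between kernel versions):

/*
 * __switch_to is implemented in assembly (arch/arm/kernel/entry-armv.S): it
 * saves prev's callee-saved registers and stack pointer, restores next's,
 * and returns the task that was previously running.
 */
#define switch_to(prev, next, last)					\
do {									\
	last = __switch_to(prev, task_thread_info(prev),		\
			   task_thread_info(next));			\
} while (0)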

3. Function call relationship diagram
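
A plain-text summary of the call relationships covered in this article:

schedule()
 └─ __schedule()
     ├─ pick_next_task()        /* per-class ->pick_next_task() callback */
     └─ context_switch()
         ├─ switch_mm()         /* address-space switch, user processes only */
         └─ switch_to()         /* register and stack switch */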
