linux内核上下文切换解析

用户4415180

发布于 2022-06-23 14:29:16

1.2K0

发布于 2022-06-23 14:29:16

文章被收录于专栏：高并发

Linux的上下文切换就是进程（线程）的切换，也就是切换struct task_struct结构体。一个任务的上下文包括CPU的寄存器、内核栈等。由于同一个CPU上的所有任务共享一套寄存器，所以在任务挂起的时候需要保存寄存器，当任务重新被调度执行的时候需要恢复寄存器。许多处理器都提供了硬件级别的上下文切换支持，比如x86架构下的TSS段。TSS段包括了一个任务执行所需要的所有上下文，主要有：1.通用寄存器和段寄存器。2.标志寄存器EFLAGS、程序指针EIP、页表基地址寄存器CR3、任务寄存器和LDTR寄存器。3.I/O映射位图基地址和I/O位图信息。4.特权级0、1、2的堆栈指针。5.链接到前一任务的链指针。所以硬件上下文切换也很简单，直接用call或者jmp指令调度任务。同样ARM架构也有快速上下文切换技术。但是Linux为了适用更多的CPU架构，没有使用处理器相关的上下文切换技术，而是大部分通过软件实现。Linux上下文切换就在schedule()函数里，很多地方都会调用这个函数。schedule函数前面大部分代码是和调度算法相关的，比如实时任务调度算法、O(1)调度算法（自2.6.23版本起被CFS调度算法取代）、CFS调度算法等。经过前面的代码计算后找出下一个要执行的任务，然后开始执行上下文切换。先看一段linux2.6.18版本（还在使用O(1)调度算法）的schedule函数代码：

/*
 * schedule() is the main scheduler function.
 *
 * It picks the next task to run on this CPU's runqueue (O(1) scheduler:
 * active/expired priority arrays plus a bitmap), updates the bookkeeping
 * of the outgoing task, and, if the chosen task differs from the current
 * one, hands over to context_switch().
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;// outgoing (current) task and the task chosen to run next
	struct prio_array *array;    // priority array being picked from (rq->active)
	struct list_head *queue;     // list of tasks at the selected priority
	unsigned long long now;      // timestamp returned by sched_clock()
	unsigned long run_time;     // how long prev has run, clamped and scaled below
	int cpu, idx, new_prio;  
	long *switch_count;        // points at the prev counter (nivcsw or nvcsw) to bump on a switch
	struct rq *rq;            // runqueue of the current CPU

	/*
	 * Test if we are atomic.  Since do_exit() needs to call into
	 * schedule() atomically, we ignore that path for now.
	 * Otherwise, whine if we are scheduling when we should not be.
	 */
	if (unlikely(in_atomic() && !current->exit_state)) {
		printk(KERN_ERR "BUG: scheduling while atomic: "
			"%s/0x%08x/%d\n",
			current->comm, preempt_count(), current->pid);
		dump_stack();
	}
	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
	preempt_disable();   // disable preemption while we pick and switch
	prev = current;      // prev is the task that is running right now
	release_kernel_lock(prev); // release the kernel lock on behalf of prev
need_resched_nonpreemptible:
	rq = this_rq();     // runqueue of the CPU we are executing on

	/*
	 * The idle task is only run when the runqueue has nothing else to
	 * run, so it is expected to always be in TASK_RUNNING state when it
	 * reaches schedule(). Anything else is reported as a bug.
	 */
	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
		dump_stack();
	}
	// scheduler statistics: count this schedule() invocation
	schedstat_inc(rq, sched_cnt);
	// current scheduler timestamp
	// NOTE(review): presumably nanoseconds, since it is compared against
	// NS_MAX_SLEEP_AVG below - confirm against sched_clock()
	now = sched_clock();
	// run_time = time since prev->timestamp, clamped to [0, NS_MAX_SLEEP_AVG];
	// a negative delta is treated as 0
	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
		run_time = now - prev->timestamp;
		if (unlikely((long long)(now - prev->timestamp) < 0))
			run_time = 0;
	} else
		run_time = NS_MAX_SLEEP_AVG;

	/*
	 * Tasks charged proportionately less run_time at high sleep_avg to
	 * delay them losing their interactive status
	 */
	run_time /= (CURRENT_BONUS(prev) ? : 1);

	spin_lock_irq(&rq->lock);

	if (unlikely(prev->flags & PF_DEAD))
		prev->state = EXIT_DEAD;

	// Default to the nivcsw counter; use nvcsw instead when prev has a
	// non-zero state and PREEMPT_ACTIVE is not set in the preempt count.
	// NOTE(review): presumably involuntary vs. voluntary context-switch
	// counters - confirm against struct task_struct
	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		switch_count = &prev->nvcsw;
		// an interruptible sleeper with a pending signal stays runnable
		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
				unlikely(signal_pending(prev))))
			prev->state = TASK_RUNNING;
		else {
			// otherwise prev is taken off the runqueue
			if (prev->state == TASK_UNINTERRUPTIBLE)
				rq->nr_uninterruptible++;
			deactivate_task(prev, rq);
		}
	}
	// id of the current CPU
	cpu = smp_processor_id();
	// If this CPU's runqueue has no runnable task, try load balancing to
	// pull tasks over from other CPUs' runqueues
	if (unlikely(!rq->nr_running)) {
		// idle load balancing
		idle_balance(cpu, rq);
		// still nothing to run after balancing: run the idle task
		if (!rq->nr_running) {
			next = rq->idle;
			rq->expired_timestamp = 0;
			wake_sleeping_dependent(cpu);
			goto switch_tasks;
		}
	}

	// the active priority array of this runqueue
	array = rq->active;
	if (unlikely(!array->nr_active)) {
		/*
		 * The active array is empty: swap the active and expired
		 * array pointers so that expired tasks get to run again
		 * (this is what prevents starvation).
		 */
		schedstat_inc(rq, sched_switch);
		rq->active = rq->expired;
		rq->expired = array;
		array = rq->active;
		rq->expired_timestamp = 0;
		rq->best_expired_prio = MAX_PRIO;
	}

	// Pick the first task of the first non-empty priority list via the
	// bitmap; this lookup is what makes the scheduler O(1)
	idx = sched_find_first_bit(array->bitmap);
	queue = array->queue + idx;
	next = list_entry(queue->next, struct task_struct, run_list);
	// For a non-real-time task flagged with an interactive sleep type,
	// recalculate its priority, crediting the time it waited on the
	// runqueue, and requeue it if the priority changed
	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
		unsigned long long delta = now - next->timestamp;
		if (unlikely((long long)(now - next->timestamp) < 0))
			delta = 0;

		if (next->sleep_type == SLEEP_INTERACTIVE)
			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

		array = next->array;
		new_prio = recalc_task_prio(next, next->timestamp + delta);

		if (unlikely(next->prio != new_prio)) {
			dequeue_task(next, array);
			next->prio = new_prio;
			enqueue_task(next, array);
		}
	}
	next->sleep_type = SLEEP_NORMAL;
	// dependent_sleeper() may veto the choice, in which case idle runs
	if (dependent_sleeper(cpu, rq, next))
		next = rq->idle;
switch_tasks:
	if (next == rq->idle)
		schedstat_inc(rq, sched_goidle);
	prefetch(next);
	prefetch_stack(next);
	clear_tsk_need_resched(prev);
	rcu_qsctr_inc(task_cpu(prev));

	update_cpu_clock(prev, rq, now);

	// charge the (scaled) run time against prev's sleep average, floor 0
	prev->sleep_avg -= run_time;
	if ((long)prev->sleep_avg <= 0)
		prev->sleep_avg = 0;
	prev->timestamp = prev->last_ran = now;
	// scheduling-info accounting for the prev -> next transition
	sched_info_switch(prev, next);
	// only perform a context switch when the chosen task differs from
	// the one currently running
	if (likely(prev != next)) {
		next->timestamp = now; // time at which next starts running
		rq->nr_switches++;   // per-runqueue context switch counter
		rq->curr = next;     // next becomes the runqueue's current task
		++*switch_count;     // bump the prev counter selected above

		// pre-switch preparation
		// NOTE(review): the original article says this is arch-specific
		// and empty on many CPUs - not visible in this excerpt
		prepare_task_switch(rq, next);
		// the actual switch; the return value is the task that was
		// running immediately before this one resumed
		prev = context_switch(rq, prev, next);
		barrier();
		/*
		 * this_rq must be evaluated again because prev may have moved
		 * CPUs since it called schedule(), thus the 'rq' on its stack
		 * frame will be invalid.
		 */
		finish_task_switch(this_rq(), prev);
	} else
		spin_unlock_irq(&rq->lock);

	// From here on we run as the (possibly newly resumed) current task.
	// If reacquiring the kernel lock fails, reschedule via the label that
	// skips preempt_disable() and release_kernel_lock().
	prev = current;
	if (unlikely(reacquire_kernel_lock(prev) < 0))
		goto need_resched_nonpreemptible;
	preempt_enable_no_resched();
	// a reschedule was requested in the meantime: start over
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
		goto need_resched;
}

可以看到前面大部分代码都是和调度算法相关的计算，最终的上下文切换函数是context_switch：

/*
 * context_switch - switch from prev to next.
 *
 * Switches the address space (mm) first, then the registers and kernel
 * stack via switch_to().
 *
 * Returns the value of prev after switch_to(), whose third (output)
 * argument is also prev: i.e. the task that ran immediately before the
 * resumed task, not necessarily the prev that was passed in.
 */
static inline struct task_struct *
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm = next->mm; // memory descriptor of the task about to run
	// Use prev->active_mm rather than prev->mm: for a kernel thread mm is
	// NULL, while for a user process/thread active_mm and mm are the
	// same, so active_mm is valid in both cases.
	struct mm_struct *oldmm = prev->active_mm; 

	 // next->mm == NULL means next is a kernel thread: it simply borrows
	 // the previous task's mm
	if (unlikely(!mm)) {  
		// the kernel thread runs on the borrowed mm
		next->active_mm = oldmm;
		// take a reference on the borrowed mm
		atomic_inc(&oldmm->mm_count);
		// Enter lazy TLB mode instead of switching page tables: a
		// kernel thread only uses the kernel part of the page tables,
		// so a TLB flush (and the refills it causes) can be avoided.
		enter_lazy_tlb(oldmm, next);
	} else
		// user process/thread: switch the mm, i.e. the page tables
		// (on x86 this reloads CR3)
		switch_mm(oldmm, mm, next);

	// prev is a kernel thread: it gives up the borrowed mm. The mm is
	// parked in rq->prev_mm.
	// NOTE(review): presumably so the reference can be dropped after the
	// switch (in finish_task_switch) - not visible in this excerpt
	if (unlikely(!prev->mm)) {
		prev->active_mm = NULL;
		WARN_ON(rq->prev_mm);
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* switch registers and kernel stack */
	switch_to(prev, next, prev);

	return prev;
}

context_switch首先切换进程的地址空间，这里会对内核线程和普通进程（线程）区别对待。当将要运行的任务是内核线程时，CPU会进入TLB懒惰模式：内核线程只在内核空间运行，而每个进程页表的内核态部分都是一样的，所以它可以直接借用前一个任务的mm，不必切换页表；这样就避免了因刷新TLB而带来的大量TLB miss，可以提高速度。如果是用户态进程（线程），则调用switch_mm进行地址空间切换。

/*
 * switch_mm - switch the address space from prev to next on this CPU.
 *
 * Reloads the CR3 page-table pointer (and the LDT when it differs) if the
 * two mm's are different; on SMP it also handles leaving lazy TLB mode
 * when they are the same.
 */
static inline void switch_mm(struct mm_struct *prev,
			     struct mm_struct *next,
			     struct task_struct *tsk)
{
	int cpu = smp_processor_id();// id of the current CPU

	// Different mm's: this is a switch between address spaces.
	// When they are equal (e.g. threads sharing one mm), nothing needs
	// to be done on a uniprocessor build.
	if (likely(prev != next)) {  
		/* stop flush ipis for the previous mm */
		cpu_clear(cpu, prev->cpu_vm_mask); // this CPU no longer uses prev
#ifdef CONFIG_SMP
		per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; // TLB state of this CPU
		per_cpu(cpu_tlbstate, cpu).active_mm = next; // this CPU's active mm is now next
#endif
		cpu_set(cpu, next->cpu_vm_mask); // mark this CPU as using next

		/* Re-load page tables */
		load_cr3(next->pgd);   // from here on the CPU executes in next's address space

		/*
		 * load the new LDT only if it differs from the old one
		 */
		if (unlikely(prev->context.ldt != next->context.ldt))
			load_LDT_nolock(&next->context, cpu);
	}
#ifdef CONFIG_SMP
	// SMP, same mm: the address space is unchanged (e.g. a switch between
	// threads of one process, or back from a kernel thread that borrowed it)
	else {
		per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
		BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
		// If this CPU's bit in next->cpu_vm_mask was not set, we were in
		// lazy TLB mode and CR3 must be reloaded
		if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
			/* We were in lazy tlb mode and leave_mm disabled 
			 * tlb flush IPI delivery. We must reload %cr3.
			 */
			load_cr3(next->pgd);
			load_LDT_nolock(&next->context, cpu);
		}
	}
#endif
}

地址空间切换完了，下面就应该是寄存器和内核栈了。由于是寄存器和内核栈的切换，所以主要得用汇编实现。看switch_to宏。

/*
 * switch_to(prev, next, last) - switch kernel stack and resume address
 * from prev to next.
 *
 * Asm operands: %0 = prev->thread.esp, %1 = prev->thread.eip,
 * %2 = last (in %eax), %3/%4 = esi/edi scratch outputs,
 * %5 = next->thread.esp, %6 = next->thread.eip; prev is passed in %eax
 * (tied to operand 2) and next in %edx.
 *
 * __switch_to is entered with jmp, not call, so its ret pops the address
 * pushed from %6 and execution continues there on next's stack. When a
 * task saved by this macro is later resumed at label 1, %eax carries the
 * value returned by __switch_to and is stored into last.
 */
#define switch_to(prev,next,last) do {					\
	unsigned long esi,edi;						\
	asm volatile("pushfl\n\t"		/* save prev's EFLAGS */	\
		     "pushl %%ebp\n\t"		/* save prev's ebp */			\
		     "movl %%esp,%0\n\t"	/* save prev's kernel stack pointer into prev->thread.esp */ \
		     "movl %5,%%esp\n\t"	/* load next's kernel stack pointer into esp: the stack is now switched */\
		     "movl $1f,%1\n\t"		/* store label 1 in prev->thread.eip: where prev resumes next time */\
		     "pushl %6\n\t"		/* push the address at which next will resume */	\
		     "jmp __switch_to\n"/* jump (not call) to __switch_to; its ret returns straight to %6 */\
		     "1:\t"						\
		     "popl %%ebp\n\t"					\
		     "popfl"						\
		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
		      "=a" (last),"=S" (esi),"=D" (edi)			\
		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
		      "2" (prev), "d" (next));				\
} while (0)

核心就是prev保存它的eflags、ebp、esp和eip（恢复执行的地址，即标号1）。然后切换到next的内核栈，至此prev进程就被挂起了。接着把next上次被挂起时保存的地址（next->thread.eip）压栈，再用jmp跳转到__switch_to函数；因为是jmp而不是call，__switch_to执行ret时会把刚压栈的地址当作返回地址，直接跳到标号1或者ret_from_fork。这里有两种情况：1、进程是新创建的、首次运行，则会跳到ret_from_fork汇编例程。2、进程是之前被挂起、现在又被重新调度的，则跳转到标号1。

再看__switch_to函数

// __switch_to - architecture-level part of the task switch, entered via
// jmp from the switch_to macro.
//
// Declared fastcall: arguments are expected in registers rather than on
// the stack (the switch_to macro places prev in %eax and next in %edx).
// Returns prev_p, which the resumed task receives in %eax.
struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();// id of the current CPU
	// this CPU's TSS
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
	// FPU state handling for the outgoing task
	__unlazy_fpu(prev_p);

	/*
	 * Reload esp0 in the TSS from next
	 */
	load_esp0(tss, next);

	// save prev's fs and gs segment registers
	savesegment(fs, prev->fs);
	savesegment(gs, prev->gs);

	/*
	 * Load next's per-thread TLS descriptors
	 */
	load_TLS(next, cpu);

	/*
	 * Load next's fs and gs; each reload is skipped when both the
	 * saved prev value and the next value are zero
	 */
	if (unlikely(prev->fs | next->fs))
		loadsegment(fs, next->fs);

	if (prev->gs | next->gs)
		loadsegment(gs, next->gs);

	/*
	 * Reload the I/O privilege level only if prev and next differ
	 */
	if (unlikely(prev->iopl != next->iopl))
		set_iopl_mask(next->iopl);

	/*
	 * Extra work (debug registers, I/O bitmap) is delegated to
	 * __switch_to_xtra() when next has _TIF_WORK_CTXSW flags set or
	 * prev has TIF_IO_BITMAP set
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)))
		__switch_to_xtra(next_p, tss);

	disable_tsc(prev_p, next_p);

	// Return prev_p (in eax). Because this function was entered by jmp,
	// the return lands on the address pushed by switch_to: ret_from_fork
	// if next is a new task that has never run, otherwise label 1 in
	// switch_to.
	return prev_p;
}

这个函数主要是保存和恢复一些寄存器。注意返回值prev_p是通过eax返回的。schedule函数中调用context_switch函数的语句是prev = context_switch(rq, prev, next);，这里得到的prev就是__switch_to返回的prev_p。比如进程A切换到进程B时，A的context_switch尚未返回，此时A栈上的prev为A；之后由进程B切换到进程C，再由进程C切换回A时，A的context_switch才返回，此时返回值prev为C。所以context_switch的返回值就是在当前进程之前运行的那个进程的task_struct。

从整体看流程