Linux Process Management


Overview

This article introduces process management in Linux, including process states, process switching, and more.

Main Content

At the kernel level, every process is described by a task_struct. This structure is very large; here is a rough look at its main parts:

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
    /*
     * For reasons of header soup (see current_thread_info()), this
     * must be the first element of task_struct.
     */
    struct thread_info      thread_info;
#endif
    unsigned int            __state;

#ifdef CONFIG_PREEMPT_RT
    /* saved state for "spinlock sleepers" */
    unsigned int            saved_state;
#endif
    /*
     * This begins the randomizable portion of task_struct. Only
     * scheduling-critical items should be added above here.
     */
    randomized_struct_fields_start

    void                *stack;
    refcount_t          usage;

// Scheduling
    int             on_rq;

    int             prio;
    int             static_prio;
    int             normal_prio;
    unsigned int            rt_priority;

    struct sched_entity     se;
    struct sched_rt_entity      rt;
    struct sched_dl_entity      dl;
    const struct sched_class    *sched_class;
// Memory management
    struct mm_struct        *mm;
    struct mm_struct        *active_mm;
// Process relationships
    pid_t               pid;
    pid_t               tgid;

#ifdef CONFIG_STACKPROTECTOR
    /* Canary value for the -fstack-protector GCC feature: */
    unsigned long           stack_canary;
#endif
    /*
     * Pointers to the (original) parent process, youngest child, younger sibling,
     * older sibling, respectively.  (p->father can be replaced with
     * p->real_parent->pid)
     */

    /* Real parent process: */
    struct task_struct __rcu    *real_parent;

    /* Recipient of SIGCHLD, wait4() reports: */
    struct task_struct __rcu    *parent;

    /*
     * Children/sibling form the list of natural children:
     */
    struct list_head        children;
    struct list_head        sibling;
    struct task_struct      *group_leader;
// Files and filesystem
    /* Filesystem information: */
    struct fs_struct        *fs;

    /* Open file information: */
    struct files_struct     *files;
// Signal handling
    /* Signal handlers: */
    struct signal_struct        *signal;
    struct sighand_struct __rcu     *sighand;
    sigset_t            blocked;
    sigset_t            real_blocked;
    /* Restored if set_restore_sigmask() was used: */
    sigset_t            saved_sigmask;
    struct sigpending       pending;
    ...
};
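The pid and tgid fields above are also what user space observes: gettid() returns the per-thread task_struct pid, while getpid() returns the tgid shared by the whole thread group. Below is a small user-space sketch illustrating this; it assumes glibc 2.30+ for the gettid() wrapper and compilation with -pthread.

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *thread_fn(void *arg)
{
    /* gettid() reports task_struct.pid; getpid() reports the tgid. */
    printf("thread: pid=%d tid=%d\n", getpid(), (int)gettid());
    return NULL;
}

int main(void)
{
    pthread_t t;

    printf("main:   pid=%d tid=%d\n", getpid(), (int)gettid());
    pthread_create(&t, NULL, thread_fn, NULL);
    pthread_join(t, NULL);
    return 0;
}

Running it shows the two threads sharing one pid (the tgid) while each has its own tid (the kernel-side pid).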

Kernel Threads

A kernel thread is essentially a process that runs in kernel mode; it has no user process address space, so it can only run within the kernel address space. Kernel threads are created as follows:

/**
 * kthread_create - create a kthread on the current node
 * @threadfn: the function to run in the thread
 * @data: data pointer for @threadfn()
 * @namefmt: printf-style format string for the thread name
 * @arg: arguments for @namefmt.
 *
 * This macro will create a kthread on the current node, leaving it in
 * the stopped state.  This is just a helper for kthread_create_on_node();
 * see the documentation there for more details.
 */
#define kthread_create(threadfn, data, namefmt, arg...) \
    kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)

/**
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @namefmt: printf-style name for the thread.
 *
 * Description: Convenient wrapper for kthread_create() followed by
 * wake_up_process().  Returns the kthread or ERR_PTR(-ENOMEM).
 */
#define kthread_run(threadfn, data, namefmt, ...)              \
({                                     \
    struct task_struct *__k                        \
        = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
    if (!IS_ERR(__k))                          \
        wake_up_process(__k);                      \
    __k;                                   \
})
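As a quick illustration, here is a minimal kernel-module sketch that starts a worker with kthread_run and stops it on module exit; the demo_* names are made up for the example.

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
    /* Run until someone calls kthread_stop() on us. */
    while (!kthread_should_stop()) {
        pr_info("demo_worker: tick\n");
        ssleep(1);
    }
    return 0;
}

static int __init demo_init(void)
{
    worker = kthread_run(worker_fn, NULL, "demo_worker");
    return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit demo_exit(void)
{
    kthread_stop(worker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");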

Process Creation

From user space, a new process can be created with fork, vfork, or clone. In the kernel these all funnel into a single implementation, do_fork (renamed kernel_clone in kernel 5.10 and later), whose core is copy_process. The rough flow is as follows:

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static __latent_entropy struct task_struct *copy_process(
                    struct pid *pid,
                    int trace,
                    int node,
                    struct kernel_clone_args *args) {
...
    /*
     * Force any signals received before this point to be delivered
     * before the fork happens.  Collect up signals sent to multiple
     * processes that happen during the fork and delay them so that
     * they appear to happen after the fork.
     */
    sigemptyset(&delayed.signal);
    INIT_HLIST_NODE(&delayed.node);

    spin_lock_irq(&current->sighand->siglock);
    if (!(clone_flags & CLONE_THREAD))
        hlist_add_head(&delayed.node, &current->signal->multiprocess);
    recalc_sigpending();
    spin_unlock_irq(&current->sighand->siglock);
    retval = -ERESTARTNOINTR;
    if (task_sigpending(current))
        goto fork_out;

    retval = -ENOMEM;
    p = dup_task_struct(current, node);
    if (!p)
        goto fork_out;
    p->flags &= ~PF_KTHREAD;
    if (args->kthread)
        p->flags |= PF_KTHREAD;
    if (args->io_thread) {
        /*
         * Mark us an IO worker, and block any signal that isn't
         * fatal or STOP
         */
        p->flags |= PF_IO_WORKER;
        siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
    }
...
    /* Perform scheduler related setup. Assign this task to a CPU. */
    retval = sched_fork(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_policy;

    retval = perf_event_init_task(p, clone_flags);
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);
    if (retval)
        goto bad_fork_cleanup_perf;
    /* copy all the process information */
    shm_init_task(p);
    retval = security_task_alloc(p, clone_flags);
    if (retval)
        goto bad_fork_cleanup_audit;
    retval = copy_semundo(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_security;
    retval = copy_files(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_semundo;
    retval = copy_fs(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_fs;
    retval = copy_signal(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_sighand;
    retval = copy_mm(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_signal;
    retval = copy_namespaces(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_namespaces;
    retval = copy_thread(p, args);
    if (retval)
        goto bad_fork_cleanup_io;

    stackleak_task_init(p);

    if (pid != &init_struct_pid) {
        pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
                args->set_tid_size);
        if (IS_ERR(pid)) {
            retval = PTR_ERR(pid);
            goto bad_fork_cleanup_thread;
        }
    }

    /*
     * This has to happen after we've potentially unshared the file
     * descriptor table (so that the pidfd doesn't leak into the child
     * if the fd table isn't shared).
     */
    if (clone_flags & CLONE_PIDFD) {
        retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
        if (retval < 0)
            goto bad_fork_free_pid;

        pidfd = retval;

        pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
                          O_RDWR | O_CLOEXEC);
        if (IS_ERR(pidfile)) {
            put_unused_fd(pidfd);
            retval = PTR_ERR(pidfile);
            goto bad_fork_free_pid;
        }
        get_pid(pid);   /* held by pidfile now */

        retval = put_user(pidfd, args->pidfd);
        if (retval)
            goto bad_fork_put_pidfd;
    }

#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
    futex_init_task(p);

    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        sas_ss_reset(p);

    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
    clear_task_syscall_work(p, SYSCALL_EMU);
#endif
    clear_tsk_latency_tracing(p);

    /* ok, now we should be set up.. */
    p->pid = pid_nr(pid);
    if (clone_flags & CLONE_THREAD) {
        p->group_leader = current->group_leader;
        p->tgid = current->tgid;
    } else {
        p->group_leader = p;
        p->tgid = p->pid;
    }

    p->nr_dirtied = 0;
    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    p->dirty_paused_when = 0;

    p->pdeath_signal = 0;
    INIT_LIST_HEAD(&p->thread_group);
    p->task_works = NULL;
    clear_posix_cputimers_work(p);

#ifdef CONFIG_KRETPROBES
    p->kretprobe_instances.first = NULL;
#endif
#ifdef CONFIG_RETHOOK
    p->rethooks.first = NULL;
#endif

    /*
     * Ensure that the cgroup subsystem policies allow the new process to be
     * forked. It should be noted that the new process's css_set can be changed
     * between here and cgroup_post_fork() if an organisation operation is in
     * progress.
     */
    retval = cgroup_can_fork(p, args);
    if (retval)
        goto bad_fork_put_pidfd;

    /*
     * Now that the cgroups are pinned, re-clone the parent cgroup and put
     * the new task on the correct runqueue. All this *before* the task
     * becomes visible.
     *
     * This isn't part of ->can_fork() because while the re-cloning is
     * cgroup specific, it unconditionally needs to place the task on a
     * runqueue.
     */
    sched_cgroup_fork(p, args);

    /*
     * From this point on we must avoid any synchronous user-space
     * communication until we take the tasklist-lock. In particular, we do
     * not want user-space to be able to predict the process start-time by
     * stalling fork(2) after we recorded the start_time but before it is
     * visible to the system.
     */

    p->start_time = ktime_get_ns();
    p->start_boottime = ktime_get_boottime_ns();

    /*
     * Make it visible to the rest of the system, but dont wake it up yet.
     * Need tasklist lock for parent etc handling!
     */
    write_lock_irq(&tasklist_lock);

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
        if (clone_flags & CLONE_THREAD)
            p->exit_signal = -1;
        else
            p->exit_signal = current->group_leader->exit_signal;
    } else {
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
        p->exit_signal = args->exit_signal;
    }

    klp_copy_process(p);

    sched_core_fork(p);

    spin_lock(&current->sighand->siglock);

    rv_task_fork(p);

    rseq_fork(p, clone_flags);

    /* Don't start children in a dying pid namespace */
    if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
        retval = -ENOMEM;
        goto bad_fork_cancel_cgroup;
    }

    /* Let kill terminate clone/fork in the middle */
    if (fatal_signal_pending(current)) {
        retval = -EINTR;
        goto bad_fork_cancel_cgroup;
    }

    /* No more failure paths after this point. */

    /*
     * Copy seccomp details explicitly here, in case they were changed
     * before holding sighand lock.
     */
    copy_seccomp(p);

    init_task_pid_links(p);
    if (likely(p->pid)) {
        ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

        init_task_pid(p, PIDTYPE_PID, pid);
        if (thread_group_leader(p)) {
            init_task_pid(p, PIDTYPE_TGID, pid);
            init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
            init_task_pid(p, PIDTYPE_SID, task_session(current));

            if (is_child_reaper(pid)) {
                ns_of_pid(pid)->child_reaper = p;
                p->signal->flags |= SIGNAL_UNKILLABLE;
            }
            p->signal->shared_pending.signal = delayed.signal;
            p->signal->tty = tty_kref_get(current->signal->tty);
            /*
             * Inherit has_child_subreaper flag under the same
             * tasklist_lock with adding child to the process tree
             * for propagate_has_child_subreaper optimization.
             */
            p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                             p->real_parent->signal->is_child_subreaper;
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            attach_pid(p, PIDTYPE_TGID);
            attach_pid(p, PIDTYPE_PGID);
            attach_pid(p, PIDTYPE_SID);
            __this_cpu_inc(process_counts);
        } else {
            current->signal->nr_threads++;
            current->signal->quick_threads++;
            atomic_inc(&current->signal->live);
            refcount_inc(&current->signal->sigcnt);
            task_join_group_stop(p);
            list_add_tail_rcu(&p->thread_group,
                      &p->group_leader->thread_group);
            list_add_tail_rcu(&p->thread_node,
                      &p->signal->thread_head);
        }
        attach_pid(p, PIDTYPE_PID);
        nr_threads++;
    }
    total_forks++;
    hlist_del_init(&delayed.node);
    spin_unlock(&current->sighand->siglock);
    syscall_tracepoint_update(p);
    write_unlock_irq(&tasklist_lock);

    if (pidfile)
        fd_install(pidfd, pidfile);

    proc_fork_connector(p);
    sched_post_fork(p);
    cgroup_post_fork(p, args);
    perf_event_fork(p);

    trace_task_newtask(p, clone_flags);
    uprobe_copy_process(p, clone_flags);

    copy_oom_score_adj(clone_flags, p);
...
}
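From user space, the whole path is hidden behind the libc wrappers. A minimal sketch (nothing kernel-specific) that exercises the copy_process path described above:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();    /* kernel side: kernel_clone() -> copy_process() */

    if (pid < 0) {
        perror("fork");
        return EXIT_FAILURE;
    }
    if (pid == 0) {        /* child: a fresh task_struct copied from the parent */
        printf("child:  pid=%d ppid=%d\n", getpid(), getppid());
        _exit(EXIT_SUCCESS);
    }
    waitpid(pid, NULL, 0); /* parent reaps the child via the wait4/SIGCHLD path */
    printf("parent: pid=%d child=%d\n", getpid(), pid);
    return EXIT_SUCCESS;
}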

Process Scheduling

nice value: ranges from -20 to 19. The larger the value, the lower the priority, i.e., the "nicer" the process is to other processes. Several fields in the kernel's task_struct relate to priority:

    int             prio; // dynamic priority: the one the scheduler classes actually use
    int             static_prio; // static priority, assigned at start; changeable via nice or sched_setscheduler
    int             normal_prio; // priority computed from static_prio and the scheduling policy
    unsigned int            rt_priority; // priority of a real-time process
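For instance, a process can lower its own priority from user space with nice(2), which feeds into the static_prio shown above; a small sketch:

#include <errno.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
    printf("nice before: %d\n", getpriority(PRIO_PROCESS, 0));

    errno = 0;
    if (nice(5) == -1 && errno != 0)   /* -1 can also be a legitimate return value */
        perror("nice");

    printf("nice after:  %d\n", getpriority(PRIO_PROCESS, 0));
    return 0;
}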

Scheduling Policies

The Linux kernel currently implements five scheduler classes by default:

  1. stop: the highest priority; it can preempt any process
  2. deadline: policy SCHED_DEADLINE, for real-time processes with strict timing requirements
  3. realtime: policies SCHED_FIFO and SCHED_RR, for ordinary real-time processes
  4. CFS: policies SCHED_NORMAL, SCHED_BATCH, and SCHED_IDLE; normal processes, scheduled by CFS
  5. idle: for the lowest-priority idle task

POSIX also provides corresponding APIs to set the scheduling policy and priority:

int sched_setscheduler(pid_t pid, int policy, const struct sched_param *param);
int sched_getscheduler(pid_t pid);
int sched_setparam(pid_t pid, const struct sched_param *param);
int sched_getparam(pid_t pid, struct sched_param *param);
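A usage sketch: switch the calling process to SCHED_FIFO with real-time priority 10 (this requires CAP_SYS_NICE, e.g. running as root):

#include <sched.h>
#include <stdio.h>

int main(void)
{
    struct sched_param sp = { .sched_priority = 10 };

    /* pid 0 means "the calling process" */
    if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
        perror("sched_setscheduler");
        return 1;
    }
    printf("policy: %d (SCHED_FIFO=%d), priority: %d\n",
           sched_getscheduler(0), SCHED_FIFO, sp.sched_priority);
    return 0;
}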

The CFS Algorithm

When process scheduling comes up, the first approach that comes to mind is a classic timeslice algorithm. In practice, the mainstream algorithm today is CFS. CFS introduces virtual time alongside real time and always picks the runnable process with the smallest virtual runtime. Roughly, virtual time is real time scaled relative to the process's weight, with the weight of nice 0 as the baseline: a smaller nice value means a larger weight, so virtual time advances more slowly than real time and the process gets to run longer; a larger nice value means a smaller weight, so virtual time advances faster than real time and the process runs for less. Let's look at the details. The nice-to-weight conversion is:

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

Nice 0 corresponds to a weight of 1024, and adjacent nice levels differ by a factor of about 1.25 (for example, 1024 / 1.25 ≈ 820, the weight of nice 1); in other words, lowering the nice value by 1 gives a task roughly 10% more CPU time. Now look at another table:

/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
const u32 sched_prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

This table can be viewed as the inverse of sched_prio_to_weight, precomputed as 2^32 / weight. Why is the inverse needed? That becomes clear once we see how virtual time is computed:

struct load_weight {
    unsigned long           weight; // the weight
    u32             inv_weight; // inverse of the weight: inv_weight = 2^32 / weight
};

struct sched_entity {
    /* For load-balancing: */
    struct load_weight      load;
    struct rb_node          run_node;
    struct list_head        group_node;
    unsigned int            on_rq;
...
}

static void set_load_weight(struct task_struct *p, bool update_load)
{
    int prio = p->static_prio - MAX_RT_PRIO;
    struct load_weight *load = &p->se.load;

    /*
     * SCHED_IDLE tasks get minimal weight:
     */
    if (task_has_idle_policy(p)) {
        load->weight = scale_load(WEIGHT_IDLEPRIO);
        load->inv_weight = WMULT_IDLEPRIO;
        return;
    }

    /*
     * SCHED_OTHER tasks have to update their load when changing their
     * weight
     */
    if (update_load && p->sched_class == &fair_sched_class) {
        reweight_task(p, prio);
    } else {
        load->weight = scale_load(sched_prio_to_weight[prio]);
        load->inv_weight = sched_prio_to_wmult[prio];
    }
}

There is one more formula: vruntime = delta_exec * nice_0_weight / weight, where delta_exec is the real running time. This formula shows what sched_prio_to_wmult is for: the kernel replaces the division by weight with a multiplication by the precomputed inverse, which is much cheaper.
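To make the trick concrete, here is a small user-space sketch (not kernel code; it mirrors the idea behind the kernel's __calc_delta()) that scales 1 ms of real runtime into vruntime using the two tables above:

#include <stdint.h>
#include <stdio.h>

#define NICE_0_WEIGHT 1024ULL

/* vruntime += delta_exec * NICE_0_WEIGHT / weight, with the division
 * replaced by a multiplication by inv_weight = 2^32 / weight. */
static uint64_t scale_vruntime(uint64_t delta_exec, uint32_t inv_weight)
{
    return (delta_exec * NICE_0_WEIGHT * inv_weight) >> 32;
}

int main(void)
{
    uint64_t delta = 1000000; /* 1 ms of real runtime, in ns */

    /* inv_weight values taken from sched_prio_to_wmult above */
    printf("nice  0: +%llu ns\n", (unsigned long long)scale_vruntime(delta, 4194304));
    printf("nice  5: +%llu ns\n", (unsigned long long)scale_vruntime(delta, 12820798));
    printf("nice -5: +%llu ns\n", (unsigned long long)scale_vruntime(delta, 1376151));
    return 0;
}

At nice 0 vruntime advances exactly as fast as real time; at nice 5 it advances about three times faster (so the task is picked less often), and at nice -5 about three times slower.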

Scheduler classes also follow an object-oriented design: to implement a new scheduler, you only need to implement the interface methods that scheduling requires, as shown below:
struct sched_class {

#ifdef CONFIG_UCLAMP_TASK
    int uclamp_enabled;
#endif

    void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
    void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
    void (*yield_task)   (struct rq *rq);
    bool (*yield_to_task)(struct rq *rq, struct task_struct *p);

    void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);

    struct task_struct *(*pick_next_task)(struct rq *rq);

    void (*put_prev_task)(struct rq *rq, struct task_struct *p);
    void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);

#ifdef CONFIG_SMP
    int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
    int  (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);

    struct task_struct * (*pick_task)(struct rq *rq);

    void (*migrate_task_rq)(struct task_struct *p, int new_cpu);

    void (*task_woken)(struct rq *this_rq, struct task_struct *task);

    void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);

    void (*rq_online)(struct rq *rq);
    void (*rq_offline)(struct rq *rq);

    struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif

    void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
    void (*task_fork)(struct task_struct *p);
    void (*task_dead)(struct task_struct *p);

    /*
     * The switched_from() call is allowed to drop rq->lock, therefore we
     * cannot assume the switched_from/switched_to pair is serialized by
     * rq->lock. They are however serialized by p->pi_lock.
     */
    void (*switched_from)(struct rq *this_rq, struct task_struct *task);
    void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
    void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
                  int oldprio);

    unsigned int (*get_rr_interval)(struct rq *rq,
                    struct task_struct *task);

    void (*update_curr)(struct rq *rq);

#ifdef CONFIG_FAIR_GROUP_SCHED
    void (*task_change_group)(struct task_struct *p);
#endif
};

Process Switching

The scheduling APIs are as follows:

extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
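As a usage sketch (assumed in-kernel/module context), the classic pattern for an interruptible timed sleep with schedule_timeout looks like this:

#include <linux/jiffies.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Sleep for about two seconds, waking early if a signal arrives. */
static void wait_about_two_seconds(void)
{
    long remaining;

    set_current_state(TASK_INTERRUPTIBLE); /* mark ourselves as sleeping */
    remaining = schedule_timeout(2 * HZ);  /* give up the CPU */
    if (remaining)
        pr_info("woken early, %ld jiffies left\n", remaining);
}

In practice, schedule_timeout_interruptible(2 * HZ) wraps exactly this set_current_state + schedule_timeout pair.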

The main job of schedule() is to pick the next process and then perform a context switch; the core of the switch is context_switch():

/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
           struct task_struct *next, struct rq_flags *rf)
{
    prepare_task_switch(rq, prev, next);

    /*
     * For paravirt, this is coupled with an exit in switch_to to
     * combine the page table reload and the switch backend into
     * one hypercall.
     */
    arch_start_context_switch(prev);

    /*
     * kernel -> kernel   lazy + transfer active
     *   user -> kernel   lazy + mmgrab() active
     *
     * kernel ->   user   switch + mmdrop() active
     *   user ->   user   switch
     */
    if (!next->mm) {                                // to kernel
        enter_lazy_tlb(prev->active_mm, next);

        next->active_mm = prev->active_mm;
        if (prev->mm)                           // from user
            mmgrab(prev->active_mm);
        else
            prev->active_mm = NULL;
    } else {                                        // to user
        membarrier_switch_mm(rq, prev->active_mm, next->mm);
        /*
         * sys_membarrier() requires an smp_mb() between setting
         * rq->curr / membarrier_switch_mm() and returning to userspace.
         *
         * The below provides this either through switch_mm(), or in
         * case 'prev->active_mm == next->mm' through
         * finish_task_switch()'s mmdrop().
         */
        switch_mm_irqs_off(prev->active_mm, next->mm, next);
        lru_gen_use_mm(next->mm);

        if (!prev->mm) {                        // from kernel
            /* will mmdrop() in finish_task_switch(). */
            rq->prev_mm = prev->active_mm;
            prev->active_mm = NULL;
        }
    }

    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

    prepare_lock_switch(rq, next, rf);

    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);
    barrier();

    return finish_task_switch(prev);
}

That is, it loads the new process's page-table base address into the page-table base register, and it also handles switching the hardware context, such as flushing the TLB.

A quick note on the TLB: it caches linear-to-physical address translations, since walking the page tables through the MMU on every access costs many more operations than fetching the physical address straight from the cache.

Next, let's look at the switch_to method (here the MIPS implementation):

/*
 * For newly created kernel threads switch_to() will return to
 * ret_from_kernel_thread, newly created user threads to ret_from_fork.
 * That is, everything following resume() will be skipped for new threads.
 * So everything that matters to new threads should be placed before resume().
 */
#define switch_to(prev, next, last)                 \
do {                                    \
    __mips_mt_fpaff_switch_to(prev);                \
    lose_fpu_inatomic(1, prev);                 \
    if (tsk_used_math(next))                    \
        __sanitize_fcr31(next);                 \
    if (cpu_has_dsp) {                      \
        __save_dsp(prev);                   \
        __restore_dsp(next);                    \
    }                               \
    if (cop2_present) {                     \
        u32 status = read_c0_status();              \
                                    \
        set_c0_status(ST0_CU2);                 \
        if ((KSTK_STATUS(prev) & ST0_CU2)) {            \
            if (cop2_lazy_restore)              \
                KSTK_STATUS(prev) &= ~ST0_CU2;      \
            cop2_save(prev);                \
        }                           \
        if (KSTK_STATUS(next) & ST0_CU2 &&          \
            !cop2_lazy_restore) {               \
            cop2_restore(next);             \
        }                           \
        write_c0_status(status);                \
    }                               \
    __clear_r5_hw_ll_bit();                     \
    __clear_software_ll_bit();                  \
    if (cpu_has_userlocal)                      \
        write_c0_userlocal(task_thread_info(next)->tp_value);   \
    __restore_watch(next);                      \
    (last) = resume(prev, next, task_thread_info(next));        \
} while (0)

switch_to is responsible for switching the stack and register state. Two questions arise at this point:

  1. Switching seems to need only two parameters, one for the current process and one for the next, i.e., prev and next. Why is a third parameter, last, needed?
  2. context_switch still has code after switch_to, such as finish_task_switch. Who ends up executing it?

Once these two questions are clear, process switching basically is too, so let's walk through it. Suppose there are two processes, A and B, and A is currently running. A is about to switch, and the process picked to run next is B, so switch_to executes with prev = A and next = B. When it finishes, the CPU has loaded B's hardware context; the registers now hold B's state, so the CPU executes B's instructions, and the code after switch_to does not run yet.

So when does the code after switch_to run? Say that a while later a process X on another CPU is ready to switch, and the target it picks is A. Now switch_to runs with prev = X and next = A, and when it completes the CPU loads A's hardware context, which still holds the saved instruction address just past A's earlier switch_to call. Execution therefore resumes in A at the code following switch_to. Before A runs its own instructions, it must do some cleanup on behalf of the previous process, and that is where the third parameter comes in: the resuming process needs to know which process ran immediately before it, and that process is not necessarily the one it switched to back then, so it has to be handed over in last. Next, let's see what finish_task_switch actually cleans up:

/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 *
 * The context switch have flipped the stack from under us and restored the
 * local variables which were saved when this task called schedule() in the
 * past. prev == current is still correct but we need to recalculate this_rq
 * because prev may have moved to another CPU.
 */
static struct rq *finish_task_switch(struct task_struct *prev)
    __releases(rq->lock)
{
    struct rq *rq = this_rq();
    struct mm_struct *mm = rq->prev_mm;
    unsigned int prev_state;

    /*
     * The previous task will have left us with a preempt_count of 2
     * because it left us after:
     *
     *  schedule()
     *    preempt_disable();            // 1
     *    __schedule()
     *      raw_spin_lock_irq(&rq->lock)    // 2
     *
     * Also, see FORK_PREEMPT_COUNT.
     */
    if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
              "corrupted preempt_count: %s/%d/0x%x\n",
              current->comm, current->pid, preempt_count()))
        preempt_count_set(FORK_PREEMPT_COUNT);

    rq->prev_mm = NULL;

    /*
     * A task struct has one reference for the use as "current".
     * If a task dies, then it sets TASK_DEAD in tsk->state and calls
     * schedule one last time. The schedule call will never return, and
     * the scheduled task must drop that reference.
     *
     * We must observe prev->state before clearing prev->on_cpu (in
     * finish_task), otherwise a concurrent wakeup can get prev
     * running on another CPU and we could rave with its RUNNING -> DEAD
     * transition, resulting in a double drop.
     */
    prev_state = READ_ONCE(prev->__state);
    vtime_task_switch(prev);
    perf_event_task_sched_in(prev, current);
    finish_task(prev);
    tick_nohz_task_switch();
    finish_lock_switch(rq);
    finish_arch_post_lock_switch();
    kcov_finish_switch(current);
    /*
     * kmap_local_sched_out() is invoked with rq::lock held and
     * interrupts disabled. There is no requirement for that, but the
     * sched out code does not have an interrupt enabled section.
     * Restoring the maps on sched in does not require interrupts being
     * disabled either.
     */
    kmap_local_sched_in();

    fire_sched_in_preempt_notifiers(current);
    /*
     * When switching through a kernel thread, the loop in
     * membarrier_{private,global}_expedited() may have observed that
     * kernel thread and not issued an IPI. It is therefore possible to
     * schedule between user->kernel->user threads without passing though
     * switch_mm(). Membarrier requires a barrier after storing to
     * rq->curr, before returning to userspace, so provide them here:
     *
     * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
     *   provided by mmdrop(),
     * - a sync_core for SYNC_CORE.
     */
    if (mm) {
        membarrier_mm_sync_core_before_usermode(mm);
        mmdrop_sched(mm);
    }
    if (unlikely(prev_state == TASK_DEAD)) {
        if (prev->sched_class->task_dead)
            prev->sched_class->task_dead(prev);

        /* Task is done with its stack. */
        put_task_stack(prev);

        put_task_struct_rcu_user(prev);
    }

    return rq;
}

Here is one more question: are both processes and threads represented in kernel mode by task_struct? The answer is yes. I once saw a structure in the kernel called thread_struct and for a while assumed that processes were task_struct and threads were thread_struct. With more experience it became clear that both processes and threads are task_struct; the only difference is that threads share the mm. So what is thread_struct then? Let's take a look (this is the arm64 version):

struct thread_struct {
    struct cpu_context  cpu_context;    /* cpu context */

    /*
     * Whitelisted fields for hardened usercopy:
     * Maintainers must ensure manually that this contains no
     * implicit padding.
     */
    struct {
        unsigned long   tp_value;   /* TLS register */
        unsigned long   tp2_value;
        struct user_fpsimd_state fpsimd_state;
    } uw;

    enum fp_type        fp_type;    /* registers FPSIMD or SVE? */
    unsigned int        fpsimd_cpu;
    void            *sve_state; /* SVE registers, if any */
    void            *za_state;  /* ZA register, if any */
    unsigned int        vl[ARM64_VEC_MAX];  /* vector length */
    unsigned int        vl_onexec[ARM64_VEC_MAX]; /* vl after next exec */
    unsigned long       fault_address;  /* fault info */
    unsigned long       fault_code; /* ESR_EL1 value */
    struct debug_info   debug;      /* debugging */
#ifdef CONFIG_ARM64_PTR_AUTH
    struct ptrauth_keys_user    keys_user;
#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
    struct ptrauth_keys_kernel  keys_kernel;
#endif
#endif
#ifdef CONFIG_ARM64_MTE
    u64         mte_ctrl;
#endif
    u64         sctlr_user;
    u64         svcr;
    u64         tpidr2_el0;
};

The contents of cpu_context are as follows:

struct cpu_context {
    unsigned long x19;
    unsigned long x20;
    unsigned long x21;
    unsigned long x22;
    unsigned long x23;
    unsigned long x24;
    unsigned long x25;
    unsigned long x26;
    unsigned long x27;
    unsigned long x28;
    unsigned long fp;
    unsigned long sp;
    unsigned long pc;
};

This is the process's hardware context. Now it should be clear: when a process is switched out, its register state is saved into this structure, and when it is switched back in, the structure's contents are loaded back into the registers and execution continues where it left off.

Multi-core Scheduling

Multi-core processors with an SMP structure are common; the layout looks like this:

[Figure: SMP multi-core processor topology]

Linux describes the scheduling hierarchy with the sched_domain data structure and scheduling groups with sched_group; a scheduling group is the smallest unit of load balancing.

Load Balancing

How do we measure a CPU's load? The simplest idea is the sum of the weights of the runnable processes on that CPU, but that is inaccurate because it ignores how processes use the CPU; some are I/O-bound, some are compute-bound. An improved formula is: CPU load = (running time / total time) * total weight of the runqueue. For example, a runqueue with total weight 2048 whose tasks keep the CPU busy half the time contributes a load of 1024.

The SMP load-balancing machinery starts with the registration of a softirq; on every tick interrupt the system checks whether SMP load balancing is due. rebalance_domains is the entry point:

/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in init_sched_domains.
 */
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
    int continue_balancing = 1;
    int cpu = rq->cpu;
    int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize, need_decay = 0;
    u64 max_cost = 0;

    rcu_read_lock();
    for_each_domain(cpu, sd) {
        /*
         * Decay the newidle max times here because this is a regular
         * visit to all the domains.
         */
        need_decay = update_newidle_cost(sd, 0);
        max_cost += sd->max_newidle_lb_cost;

        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!continue_balancing) {
            if (need_decay)
                continue;
            break;
        }

        interval = get_sd_balance_interval(sd, busy);

        need_serialize = sd->flags & SD_SERIALIZE;
        if (need_serialize) {
            if (!spin_trylock(&balancing))
                goto out;
        }

        if (time_after_eq(jiffies, sd->last_balance + interval)) {
            if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
                /*
                 * The LBF_DST_PINNED logic could have changed
                 * env->dst_cpu, so we can't know our idle
                 * state even if we migrated tasks. Update it.
                 */
                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
                busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
            }
            sd->last_balance = jiffies;
            interval = get_sd_balance_interval(sd, busy);
        }
        if (need_serialize)
            spin_unlock(&balancing);
out:
        if (time_after(next_balance, sd->last_balance + interval)) {
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }
    }
    if (need_decay) {
        /*
         * Ensure the rq-wide value also decays but keep it at a
         * reasonable floor to avoid funnies with rq->avg_idle.
         */
        rq->max_idle_balance_cost =
            max((u64)sysctl_sched_migration_cost, max_cost);
    }
    rcu_read_unlock();

    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))
        rq->next_balance = next_balance;

}

In essence, it traverses the scheduling domains bottom-up starting from the current CPU; if the current CPU is idle, it finds the busiest scheduling group and migrates tasks from it onto the current CPU.
