Linux kernel Namespace源码分析

学习一下linux kernel namespace的代码还是很有必要的,让你对docker容器的namespace隔离有更深的认识。我的源码分析,是基于Linux Kernel 4.4.19 (https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.4.19.gz)版本的,由于namespace模块更新很少,因此其他相近版本之间雷同。User namespace由于与其他namespaces耦合在一起,比较难分析,我将在后续再作分析。

Kernel,Namespace,Process

Linux Namespace是一种Linux Kernel提供的资源隔离方案,提供Pid,Network,Ipc,Uts,Mount等资源的隔离,每个Namespace下的这些资源对于其他Namespace是不可见的。 注意,一个进程可以同时属于多个Namespace。Linux Kernel、Namespace、Process之间的关系可以用下图描述。

Begin with “task_struct”

As u know, Linux Namespace是用来做进程资源隔离的,那么在进程描述符中,一定有对应的Namespaces Info。 在linux-4.4.19/include/linux/sched.h #1380 定义task_struct结构体,该结构体是Linux Process完整信息的集合,其中就包含了一个指向Namespace结构体的指针nsproxy。

/*
 * Excerpt of the process descriptor. Only the namespace-related member is
 * shown; each "..." stands for fields elided from this listing.
 */
struct task_struct {
      ...
      /* namespaces: pointer to the nsproxy describing this task's namespace set */
      struct nsproxy *nsproxy;
      ...
}

nsproxy结构体的定义在linux-4.4.19/include/linux/nsproxy.h #29

/*
 * A structure to contain pointers to all per-process
 * namespaces - fs (mount), uts, network, sysvipc, etc.
 *
 * The pid namespace is an exception -- it's accessed using
 * task_active_pid_ns.  The pid namespace here is the
 * namespace that children will use.
 *
 * 'count' is the number of tasks holding a reference.
 * The count for each namespace, then, will be the number
 * of nsproxies pointing to it, not the number of tasks.
 *
 * The nsproxy is shared by tasks which share all namespaces.
 * As soon as a single namespace is cloned or unshared, the
 * nsproxy is copied.
 */
struct nsproxy {
      atomic_t count;                             /* number of tasks holding a reference to this nsproxy */
      struct uts_namespace *uts_ns;               /* UTS namespace */
      struct ipc_namespace *ipc_ns;               /* IPC (sysvipc) namespace */
      struct mnt_namespace *mnt_ns;               /* mount (fs) namespace */
      struct pid_namespace *pid_ns_for_children;  /* pid namespace that children will use */
      struct net         *net_ns;                 /* network namespace */
};

注意:正如上面的代码注释所述,共享全部namespace的task会共享同一个nsproxy;只要其中任何一个namespace被clone或unshare,nsproxy就会被复制一份。 同时,nsproxy.h中声明了一些对namespace的操作,包括copy_namespaces等。

 /*
  * Operations on nsproxy declared in nsproxy.h. Only the prototypes are
  * listed here; the bodies of the two inline helpers are elided ("…").
  */
 /* Called on the fork path with the clone flags and the new task. */
 int copy_namespaces(unsigned long flags, struct task_struct *tsk);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
void free_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *);
int __init nsproxy_cache_init(void);
/* NOTE(review): presumably drop / take a reference on ns->count -- bodies not shown, confirm. */
static inline void put_nsproxy(struct nsproxy *ns) { … }
static inline void get_nsproxy(struct nsproxy *ns) { … }

###uts_namespace linux-4.4.19/include/linux/utsname.h #12

/* UTS namespace object; nsproxy.uts_ns points at one of these. */
struct uts_namespace {
       struct kref kref;                 /* reference counter (kref) */
       struct new_utsname name;          /* the utsname data held by this namespace */
       struct user_namespace *user_ns;   /* associated user namespace */
       struct ns_common ns;              /* part shared by all namespace types */
};

###ipc_namespace linux-4.4.19/include/linux/ipc_namespace.h #21

/*
 * IPC namespace object; nsproxy.ipc_ns points at one of these.
 * Judging by the field prefixes it groups semaphore (sem_*), message queue
 * (msg_*), shared memory (shm_*) and mqueue (mq_*) settings and counters.
 */
struct ipc_namespace {
       atomic_t      count;
       struct ipc_ids      ids[3];


       int          sem_ctls[4];
       int          used_sems;


       unsigned int msg_ctlmax;
       unsigned int msg_ctlmnb;
       unsigned int msg_ctlmni;
       atomic_t      msg_bytes;
       atomic_t      msg_hdrs;


       size_t           shm_ctlmax;
       size_t           shm_ctlall;
       unsigned long     shm_tot;
       int          shm_ctlmni;
       /*
        * Defines whether IPC_RMID is forced for _all_ shm segments regardless
        * of shmctl()
        */
       int          shm_rmid_forced;


       struct notifier_block ipcns_nb;


       /* The kern_mount of the mqueuefs sb.  We take a ref on it */
       struct vfsmount  *mq_mnt;


       /* # queues in this ns, protected by mq_lock */
       unsigned int    mq_queues_count;


       /* next fields are set through sysctl */
       unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */
       unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */
       unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */
       unsigned int    mq_msg_default;
       unsigned int    mq_msgsize_default;


       /* user_ns which owns the ipc ns */
       struct user_namespace *user_ns;


       struct ns_common ns;
};

###mnt_namespace linux-4.4.19/fs/mount.h #7

/* Mount namespace object; nsproxy.mnt_ns points at one of these. */
struct mnt_namespace {
       atomic_t             count;
       struct ns_common     ns;
       struct mount *    root;       /* NOTE(review): presumably the root mount of this namespace -- confirm */
       struct list_head   list;
       struct user_namespace    *user_ns;
       u64               seq; /* Sequence number to prevent loops */
       wait_queue_head_t poll;
       u64 event;
};

###pid_namespace linux-4.4.19/include/linux/pid_namespace.h #24

/*
 * PID namespace object; nsproxy.pid_ns_for_children points at one of these.
 * The 'parent' pointer and 'level' field indicate that pid namespaces are
 * linked to a parent namespace.
 */
struct pid_namespace {
       struct kref kref;
       struct pidmap pidmap[PIDMAP_ENTRIES];
       struct rcu_head rcu;
       int last_pid;
       unsigned int nr_hashed;
       struct task_struct *child_reaper;
       struct kmem_cache *pid_cachep;
       unsigned int level;
       struct pid_namespace *parent;
#ifdef CONFIG_PROC_FS
       /* procfs-related members, only present with CONFIG_PROC_FS */
       struct vfsmount *proc_mnt;
       struct dentry *proc_self;
       struct dentry *proc_thread_self;
#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
       struct fs_pin *bacct;
#endif
       struct user_namespace *user_ns;
       struct work_struct proc_work;
       kgid_t pid_gid;
       int hide_pid;
       int reboot;    /* group exit code if this pidns was rebooted */
       struct ns_common ns;
};

###net_namespace linux-4.4.19/include/net/net_namespace.h #47

/*
 * Network namespace object; nsproxy.net_ns points at one of these.
 * Besides bookkeeping (reference counts, list linkage, owning user
 * namespace) it embeds one netns_* member per protocol/subsystem, most of
 * them compiled in only when the matching CONFIG_* option is enabled.
 */
struct net {
       atomic_t             passive; /* To decided when the network
                                           * namespace should be freed.
                                           */
       atomic_t             count;           /* To decided when the network
                                           *  namespace should be shut down.
                                           */
       spinlock_t            rules_mod_lock;


       atomic64_t         cookie_gen;


       struct list_head   list;        /* list of network namespaces */
       struct list_head   cleanup_list; /* namespaces on death row */
       struct list_head   exit_list;       /* Use only net_mutex */


       struct user_namespace   *user_ns;   /* Owning user namespace */
       spinlock_t            nsid_lock;
       struct idr             netns_ids;


       struct ns_common     ns;


       struct proc_dir_entry       *proc_net;
       struct proc_dir_entry       *proc_net_stat;


#ifdef CONFIG_SYSCTL
       struct ctl_table_set   sysctls;
#endif


       struct sock          *rtnl;                   /* rtnetlink socket */
       struct sock           *genl_sock;


       struct list_head dev_base_head;
       struct hlist_head       *dev_name_head;
       struct hlist_head *dev_index_head;
       unsigned int        dev_base_seq;    /* protected by rtnl_mutex */
       int                 ifindex;
       unsigned int        dev_unreg_count;


       /* core fib_rules */
       struct list_head   rules_ops;

       struct net_device       *loopback_dev;          /* The loopback */
       struct netns_core       core;
       struct netns_mib mib;
       struct netns_packet   packet;
       struct netns_unix       unx;
       struct netns_ipv4       ipv4;
#if IS_ENABLED(CONFIG_IPV6)
       struct netns_ipv6       ipv6;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
       struct netns_ieee802154_lowpan  ieee802154_lowpan;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
       struct netns_sctp       sctp;
#endif
#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
       struct netns_dccp      dccp;
#endif
#ifdef CONFIG_NETFILTER
       struct netns_nf          nf;
       struct netns_xt           xt;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
       struct netns_ct           ct;
#endif
#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
       struct netns_nftables nft;
#endif
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
       struct netns_nf_frag  nf_frag;
#endif
       struct sock           *nfnl;
       struct sock           *nfnl_stash;
#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT)
       struct list_head        nfnl_acct_list;
#endif
#endif
#ifdef CONFIG_WEXT_CORE
       struct sk_buff_head   wext_nlevents;
#endif
       struct net_generic __rcu  *gen;


       /* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
       struct netns_xfrm      xfrm;
#endif
#if IS_ENABLED(CONFIG_IP_VS)
       struct netns_ipvs *ipvs;
#endif
#if IS_ENABLED(CONFIG_MPLS)
       struct netns_mpls      mpls;
#endif
       struct sock           *diag_nlsk;
       atomic_t             fnhe_genid;
};

task_struct, nsproxy, uts_ns, ipc_ns,….之间的结构关系如下:

## 各个namespace的初始化

Kernel中有一个默认的nsproxy,即init_nsproxy,它在初始task(init task)初始化时被使用:INIT_TASK宏把task的nsproxy指针指向init_nsproxy。该引用位于linux-4.4.19/include/linux/init_task.h #232

/*
 * Heavily abridged excerpt of the INIT_TASK initializer ("…" marks elided
 * members): the initial task's nsproxy pointer is set to &init_nsproxy.
 */
#define INIT_TASK(tsk)   
{
…
nsproxy  = &init_nsproxy,
…
}
继续跟进init_nsproxy的定义,@ linux-4.4.19/kernel/nsproxy.c #31
/*
 * The default nsproxy. It starts with a reference count of 1 and points at
 * the statically defined init_* namespaces. The ipc and net entries are set
 * only when the corresponding config options are enabled, and the mount
 * namespace is left NULL here.
 */
struct nsproxy init_nsproxy = {
       .count                  = ATOMIC_INIT(1),
       .uts_ns                = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
       .ipc_ns                 = &init_ipc_ns,
#endif
       .mnt_ns               = NULL,   /* not initialized statically */
       .pid_ns_for_children  = &init_pid_ns,
#ifdef CONFIG_NET
       .net_ns                = &init_net,
#endif
};

可见,系统初始化task时,完成了对uts, ipc, pid, net的默认初始化工作,唯独mount没有。具体的各个namespace的initial code location如下:

  • init_pid_ns —— linux-4.4.19/kernel/pid.c #70
  • init_uts_ns —— linux-4.4.19/init/version.c(原文写作 kernel/user.c #25,但 kernel/user.c 中定义的是 init_user_ns)
  • init_ipc_ns —— linux-4.4.19/ipc/msgutil.c #31
  • init_net —— linux-4.4.19/net/core/net_namespace.c #35

### The workflow of Create Namespace

系统如何Create New Namespace? The answer is: `int clone(int (*fn)(void *), void *child_stack, int flags, void *arg)`。clone()是libc库中封装的函数,我们不对其进行深究。在linux kernel中,fork/vfork/clone三个系统调用都是对_do_fork()的封装,代码在linux/linux-4.4.19/kernel/fork.c #1808-1833

/*
 * fork() system call, built only when the architecture defines
 * __ARCH_WANT_SYS_FORK. With an MMU it is _do_fork() with SIGCHLD as the
 * only flag and every other argument zero/NULL; without an MMU it is
 * rejected with -EINVAL.
 */
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
       return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
       /* can not support in nommu mode */
       return -EINVAL;
#endif
}
#endif
 
 
/*
 * vfork() system call, built only when the architecture defines
 * __ARCH_WANT_SYS_VFORK: _do_fork() with CLONE_VFORK | CLONE_VM | SIGCHLD
 * and every other argument zero/NULL.
 */
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
       return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
                     0, NULL, NULL, 0);
}
#endif
  
 
 
/*
 * clone() system call, built only when the architecture defines
 * __ARCH_WANT_SYS_CLONE. The argument order depends on the architecture:
 *   - CONFIG_CLONE_BACKWARDS:  tls comes before child_tidptr
 *   - CONFIG_CLONE_BACKWARDS2: newsp comes before clone_flags
 *   - CONFIG_CLONE_BACKWARDS3: an extra stack_size argument follows newsp
 *   - default:                 clone_flags, newsp, parent_tidptr,
 *                              child_tidptr, tls
 * Whatever the order, the body forwards to _do_fork() and always passes 0
 * as the stack size (the stack_size argument of the BACKWARDS3 variant is
 * not forwarded).
 */
#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
               int __user *, parent_tidptr,
               unsigned long, tls,
               int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
               int __user *, parent_tidptr,
               int __user *, child_tidptr,
               unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
              int, stack_size,
              int __user *, parent_tidptr,
              int __user *, child_tidptr,
              unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
               int __user *, parent_tidptr,
               int __user *, child_tidptr,
               unsigned long, tls)
#endif
{
       return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}
#endif /* __ARCH_WANT_SYS_CLONE */

fork 通过0x80中断(系统调用)来陷入内核,由系统提供的相应系统调用来完成进程的创建。 上面显示,不论是vfork、fork还是clone来创建新进程,最终都是通过 _do_fork()来负责实现。 跟踪到linux/linux-4.4.19/kernel/fork.c #1693,查看_do_fork()函数的实现:

/*
 * Common back end of the fork/vfork/clone system calls.
 *
 * Creates the child through copy_process(), optionally reports the event to
 * a tracer, wakes the child up and, for CLONE_VFORK, waits for the child to
 * signal vfork completion.
 *
 * Returns the child's pid number as computed by pid_vnr() on success, or the
 * negative error code extracted from copy_process()'s ERR_PTR on failure.
 */
long _do_fork(unsigned long clone_flags,
             unsigned long stack_start,
             unsigned long stack_size,
             int __user *parent_tidptr,
             int __user *child_tidptr,
             unsigned long tls)
{


       // pointer to the new process descriptor
       struct task_struct *p;
       int trace = 0;
       long nr;


       // Choose the ptrace event to report (VFORK / CLONE / FORK) unless the
       // caller asked for CLONE_UNTRACED; drop it again if that event is not
       // enabled for the current task.
       if (!(clone_flags & CLONE_UNTRACED)) {
              if (clone_flags & CLONE_VFORK)
                     trace = PTRACE_EVENT_VFORK;
              else if ((clone_flags & CSIGNAL) != SIGCHLD)
                     trace = PTRACE_EVENT_CLONE;
              else
                     trace = PTRACE_EVENT_FORK;


              if (likely(!ptrace_event_enabled(current, trace)))
                     trace = 0;
       }

       // Duplicate the process descriptor; copy_process() returns a
       // task_struct pointer (or an ERR_PTR on failure).
       p = copy_process(clone_flags, stack_start, stack_size,
                      child_tidptr, NULL, trace, tls);

       if (!IS_ERR(p)) {
              struct completion vfork;
              struct pid *pid;


              trace_sched_process_fork(current, p);

              // fetch the pid of the newly created process descriptor
              pid = get_task_pid(p, PIDTYPE_PID);
              nr = pid_vnr(pid);


              if (clone_flags & CLONE_PARENT_SETTID)
                     put_user(nr, parent_tidptr);


              // for vfork(): initialize the vfork completion the parent waits on
              if (clone_flags & CLONE_VFORK) {
                     p->vfork_done = &vfork;
                     init_completion(&vfork);
                     get_task_struct(p);
              }


              // hand the child to the scheduler so that it can start running
              wake_up_new_task(p);


              // fork is done, the child is about to run: notify the tracer
              if (unlikely(trace))
                     ptrace_event_pid(trace, pid);


               // for vfork(): the parent waits here until the child is done
              if (clone_flags & CLONE_VFORK) {
                     if (!wait_for_vfork_done(p, &vfork))
                            ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
              }


              put_pid(pid);
       } else {
              nr = PTR_ERR(p);
       }
       return nr;
}

_do_fork 流程

  • 调用 copy_process 为子进程复制出一份进程信息
  • 如果是 vfork 初始化完成处理信息
  • 调用 wake_up_new_task 将子进程加入调度器,为之分配 CPU
  • 如果是 vfork,父进程等待子进程完成 exec 替换自己的地址空间

##copy_process源码分析 linux/linux-4.4.19/kernel/fork.c #1243中,定义了copy_process()函数的实现:

/*
 * Create a new task as a copy of the current one (abridged listing:
 * "//......" marks code elided from this excerpt, and the bad_fork_* /
 * fork_out error labels targeted by the gotos are not shown).
 *
 * Validates the clone flags, duplicates the task_struct, applies process
 * count limits, initializes per-task state, copies or shares each resource
 * according to clone_flags (including namespaces via copy_namespaces()),
 * allocates a pid and links the child to its parent.
 *
 * Returns the new task_struct, or an ERR_PTR() for the flag-validation
 * failures returned directly below.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
                                   unsigned long stack_start,
                                   unsigned long stack_size,
                                   int __user *child_tidptr,
                                   struct pid *pid,
                                   int trace,
                                   unsigned long tls)
{
       int retval;

        // pointer to the new process descriptor
       struct task_struct *p;
       void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};


        // Validate the clone flags: CLONE_NEWNS and CLONE_NEWUSER are each
        // mutually exclusive with CLONE_FS; CLONE_THREAD requires
        // CLONE_SIGHAND, which in turn requires CLONE_VM.
       if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
              return ERR_PTR(-EINVAL);
       if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
              return ERR_PTR(-EINVAL);
       if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
              return ERR_PTR(-EINVAL);
       if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
              return ERR_PTR(-EINVAL);

       // CLONE_PARENT is refused when the caller's signal flags carry
       // SIGNAL_UNKILLABLE.
       // NOTE(review): presumably this protects init processes -- confirm.
       if ((clone_flags & CLONE_PARENT) &&
                            current->signal->flags & SIGNAL_UNKILLABLE)
              return ERR_PTR(-EINVAL);
       // A new thread may not request a new user/pid namespace, and the
       // caller's active pid namespace must equal the one set for its children.
       if (clone_flags & CLONE_THREAD) {
              if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                  (task_active_pid_ns(current) !=
                            current->nsproxy->pid_ns_for_children))
                     return ERR_PTR(-EINVAL);
       }
       retval = security_task_create(clone_flags);
       if (retval)
              goto fork_out;
       retval = -ENOMEM;


       // duplicate the current task_struct
       p = dup_task_struct(current);
       if (!p)
              goto fork_out;

       ftrace_graph_init_task(p);

       // initialize the task's rt-mutex state
       rt_mutex_init_task(p);


#ifdef CONFIG_PROVE_LOCKING
       DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
       DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
       retval = -EAGAIN;


       // Check the owning user's process count against RLIMIT_NPROC; the
       // limit is ignored for INIT_USER and for callers holding
       // CAP_SYS_RESOURCE or CAP_SYS_ADMIN.
       if (atomic_read(&p->real_cred->user->processes) >=
                     task_rlimit(p, RLIMIT_NPROC)) {
              if (p->real_cred->user != INIT_USER &&
                  !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                     goto bad_fork_free;
       }
       current->flags &= ~PF_NPROC_EXCEEDED;
       retval = copy_creds(p, clone_flags);
       if (retval < 0)
              goto bad_fork_free;
       retval = -EAGAIN;

        // check the thread count against the max_threads limit
       if (nr_threads >= max_threads)
              goto bad_fork_cleanup_count;

       delayacct_tsk_init(p);       /* Must remain after dup_task_struct() */
       p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
       p->flags |= PF_FORKNOEXEC;
       INIT_LIST_HEAD(&p->children);
       INIT_LIST_HEAD(&p->sibling);
       rcu_copy_process(p);
       p->vfork_done = NULL;


       // initialize the spinlock
       spin_lock_init(&p->alloc_lock);
       // initialize the pending-signal set
       init_sigpending(&p->pending);


       // the child starts with zeroed CPU time accounting
       p->utime = p->stime = p->gtime = 0;
       p->utimescaled = p->stimescaled = 0;
       prev_cputime_init(&p->prev_cputime);


#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
       seqlock_init(&p->vtime_seqlock);
       p->vtime_snap = 0;
       p->vtime_snap_whence = VTIME_SLEEPING;
#endif
#if defined(SPLIT_RSS_COUNTING)
       memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
       p->default_timer_slack_ns = current->timer_slack_ns;
       task_io_accounting_init(&p->ioac);
       acct_clear_integrals(p);

       // initialize the CPU timers
       posix_cpu_timers_init(p);

       p->start_time = ktime_get_ns();
       p->real_start_time = ktime_get_boot_ns();
       p->io_context = NULL;
       p->audit_context = NULL;
       threadgroup_change_begin(current);
       cgroup_fork(p);
        // ...... (code elided in this excerpt)

        // scheduler-side setup of the new task; sched_fork() also sets the
        // task state to TASK_RUNNING
       retval = sched_fork(clone_flags, p);


        // bail out if sched_fork() failed
       if (retval)
              goto bad_fork_cleanup_policy;

       retval = perf_event_init_task(p);
       if (retval)
              goto bad_fork_cleanup_policy;
       retval = audit_alloc(p);
       if (retval)
              goto bad_fork_cleanup_perf;
       /* copy all the process information */
       // semaphore undo state, open files, fs info, signal handlers, signal
       // state and memory descriptor, each with its own cleanup label
       shm_init_task(p);
       retval = copy_semundo(clone_flags, p);
       if (retval)
              goto bad_fork_cleanup_audit;
       retval = copy_files(clone_flags, p);
       if (retval)
              goto bad_fork_cleanup_semundo;
       retval = copy_fs(clone_flags, p);
       if (retval)
              goto bad_fork_cleanup_files;
       retval = copy_sighand(clone_flags, p);
       if (retval)
              goto bad_fork_cleanup_fs;
       retval = copy_signal(clone_flags, p);
       if (retval)
              goto bad_fork_cleanup_sighand;
       retval = copy_mm(clone_flags, p);
       if (retval)
              goto bad_fork_cleanup_signal;

       // copy (or share) the namespaces
       retval = copy_namespaces(clone_flags, p);
       if (retval)
              goto bad_fork_cleanup_mm;
       retval = copy_io(clone_flags, p);
       if (retval)
              goto bad_fork_cleanup_namespaces;

       // set up the child's thread state (kernel stack pointer, registers, TLS)
       retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
       if (retval)
              goto bad_fork_cleanup_io;

        // Allocate a new pid in the pid namespace selected for children,
        // unless the caller passed &init_struct_pid.
       if (pid != &init_struct_pid) {
              pid = alloc_pid(p->nsproxy->pid_ns_for_children);
              if (IS_ERR(pid)) {
                     retval = PTR_ERR(pid);
                     goto bad_fork_cleanup_io;
              }
       }

       p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
       p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;

        // ...... (code elided in this excerpt)
        // set the child's pid
       p->pid = pid_nr(pid);
        // ...... (code elided in this excerpt)

       /*
        * Ensure that the cgroup subsystem policies allow the new process to be
        * forked. It should be noted the the new process's css_set can be changed
        * between here and cgroup_post_fork() if an organisation operation is in
        * progress.
        */
       retval = cgroup_can_fork(p, cgrp_ss_priv);
       if (retval)
              goto bad_fork_free_pid;


       /*
        * Make it visible to the rest of the system, but dont wake it up yet.
        * Need tasklist lock for parent etc handling!
        */
       write_lock_irq(&tasklist_lock);


       /* CLONE_PARENT re-uses the old parent */
       if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
              p->real_parent = current->real_parent;
              p->parent_exec_id = current->parent_exec_id;
       } else {
              p->real_parent = current;
              p->parent_exec_id = current->self_exec_id;
       }


       spin_lock(&current->sighand->siglock);


       // copy the seccomp configuration
       copy_seccomp(p);


       /*
        * Process group and session signals need to be delivered to just the
        * parent before the fork or both the parent and the child after the
        * fork. Restart if a signal comes in before we add the new process to
        * it's process group.
        * A fatal signal pending means that current will exit, so the new
        * thread can't slip out of an OOM kill (or normal SIGKILL).
       */
       recalc_sigpending();
       if (signal_pending(current)) {
              spin_unlock(&current->sighand->siglock);
              write_unlock_irq(&tasklist_lock);
              retval = -ERESTARTNOINTR;
              goto bad_fork_cancel_cgroup;
       }

        // ...... (code elided in this excerpt)

       total_forks++;
       spin_unlock(&current->sighand->siglock);
       syscall_tracepoint_update(p);
       write_unlock_irq(&tasklist_lock);

       proc_fork_connector(p);
       cgroup_post_fork(p, cgrp_ss_priv);
       threadgroup_change_end(current);
       perf_event_fork(p);

       trace_task_newtask(p, clone_flags);
       uprobe_copy_process(p, clone_flags);

       // return the new task_struct
       return p;
}

copy_process流程:

  • 调用 dup_task_struct 复制当前的 task_struct,为新进程分配了新的堆栈
  • 检查进程数是否超过限制
  • 初始化自旋锁、挂起信号、CPU 定时器等
  • 调用 sched_fork 初始化进程数据结构,并把进程状态设置为 TASK_RUNNING
  • 复制所有进程信息,包括文件系统、信号处理函数、信号、内存管理等
  • 调用copy_namespaces复制namespaces
  • 调用 copy_thread_tls 初始化子进程内核栈,将父进程的寄存器上下文copy给了子进程
  • 为新进程分配并设置新的 pid

###dup_task_struct源码分析 dup_task_struct()函数的定义在linux-4.4.19/kernel/fork.c #334

/*
 * Allocate a new task_struct and thread_info for a child of 'orig'
 * (abridged listing: the copy of the parent's contents and the free_tsk
 * error label are elided).
 *
 * Returns the new task_struct, or NULL if the task_struct allocation fails.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
       struct task_struct *tsk;
       struct thread_info *ti;
       int node = tsk_fork_get_node(orig);
       int err;

       // allocate a task_struct on the chosen node
       tsk = alloc_task_struct_node(node);
       if (!tsk)
              return NULL;

       // allocate a thread_info on the same node; it shares its memory with
       // the kernel stack, so 'ti' is also the base of that stack
       ti = alloc_thread_info_node(tsk, node);
       if (!ti)
              goto free_tsk;

       // point the new task's stack at the freshly allocated area
       tsk->stack = ti;

       // ...... (code elided in this excerpt)

       return tsk;
}

dup_task_struct流程:

  • 调用alloc_task_struct_node分配一个 task_struct 节点
  • 调用alloc_thread_info_node分配一个 thread_info 节点,其实是分配了一个thread_union联合体,将栈底返回给 ti
/*
 * The thread_info and the stack array share the same storage, which spans
 * THREAD_SIZE bytes.
 */
union thread_union {
   struct thread_info thread_info;
  unsigned long stack[THREAD_SIZE/sizeof(long)];
};
  • 最后将栈底的值 ti 赋值给新节点的栈
  • 最终执行完dup_task_struct之后,子进程除了tsk->stack指针不同之外,其余内容与父进程的task_struct完全相同。

### sched_fork源码分析

linux-4.4.19/kernel/sched/core.c #2187
/*
 * Scheduler-side setup of a newly forked task (abridged listing).
 * Marks the task TASK_RUNNING and assigns it to the CPU this code is
 * running on. Returns 0.
 */
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
       unsigned long flags;
       int cpu = get_cpu();


       __sched_fork(clone_flags, p);

       // set the child's state to TASK_RUNNING
       p->state = TASK_RUNNING;

       // ...... (code elided in this excerpt)

       // assign the child to the current CPU
       set_task_cpu(p, cpu);


       put_cpu();
       return 0;
}

我们可以看到sched_fork大致完成了两项重要工作,一是将子进程状态设置为 TASK_RUNNING,二是为其分配 CPU。

### copy_thread_tls源码分析

linux-4.4.19/arch/x86/kernel/process_64.c #156

/*
 * Set up the architecture-specific thread state of a new task 'p'
 * (x86-64 version).
 *
 * For kernel threads (PF_KTHREAD) the saved registers are zeroed and seeded
 * with the function ('sp') and its argument ('arg'). For user tasks the
 * parent's registers are copied, ax is cleared, the stack pointer is
 * optionally replaced, the I/O bitmap is duplicated if the parent has one,
 * and a new TLS is installed when CLONE_SETTLS is given.
 *
 * Returns 0 on success, -ENOMEM if the I/O bitmap cannot be duplicated, or
 * the error reported by the TLS setup.
 */
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
              unsigned long arg, struct task_struct *p, unsigned long tls)
{
       int err;
       struct pt_regs *childregs;
       struct task_struct *me = current;


       // top of the child's kernel stack
       p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;


       // locate the child's saved register area
       childregs = task_pt_regs(p);
       p->thread.sp = (unsigned long) childregs;
       set_tsk_thread_flag(p, TIF_FORK);
       p->thread.io_bitmap_ptr = NULL;


       // record the current segment selectors in the child's thread struct
       savesegment(gs, p->thread.gsindex);
       p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
       savesegment(fs, p->thread.fsindex);
       p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
       savesegment(es, p->thread.es);
       savesegment(ds, p->thread.ds);
       memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));


       if (unlikely(p->flags & PF_KTHREAD)) {
              /* kernel thread */
              memset(childregs, 0, sizeof(struct pt_regs));
              childregs->sp = (unsigned long)childregs;
              childregs->ss = __KERNEL_DS;
              childregs->bx = sp; /* function */
              childregs->bp = arg;
              childregs->orig_ax = -1;
              childregs->cs = __KERNEL_CS | get_kernel_rpl();
              childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
              return 0;
       }


       // copy the current task's registers to the child
       *childregs = *current_pt_regs();


       // clear ax in the child's registers, which is why fork returns 0 in
       // the child
       childregs->ax = 0;
       if (sp)
              childregs->sp = sp;


       err = -ENOMEM;
       // duplicate the parent's I/O permission bitmap if it has one
       if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
              p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
                                            IO_BITMAP_BYTES, GFP_KERNEL);
              if (!p->thread.io_bitmap_ptr) {
                     p->thread.io_bitmap_max = 0;
                     return -ENOMEM;
              }
              set_tsk_thread_flag(p, TIF_IO_BITMAP);
       }


       /*
        * Set a new TLS for the child thread?
        */
       if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
              if (is_ia32_task())
                     err = do_set_thread_area(p, -1,
                            (struct user_desc __user *)tls, 0);
              else
#endif
                     err = do_arch_prctl(p, ARCH_SET_FS, tls);
              if (err)
                     goto out;
       }
       err = 0;
out:
       // on failure, release the I/O bitmap copy made above
       if (err && p->thread.io_bitmap_ptr) {
              kfree(p->thread.io_bitmap_ptr);
              p->thread.io_bitmap_max = 0;
       }


       return err;
}

## copy_namespaces源码分析

copy_namespaces()函数的定义在linux-4.4.19/kernel/nsproxy.c #124

/*
 * Give the new task 'tsk' its namespaces according to the clone flags.
 *
 * If none of CLONE_NEWNS/NEWUTS/NEWIPC/NEWPID/NEWNET is set, the existing
 * nsproxy is kept and only gets an extra reference. Otherwise a new nsproxy
 * is built with create_new_namespaces() and installed in tsk->nsproxy.
 *
 * Returns 0 on success, -EPERM if the caller lacks CAP_SYS_ADMIN in the
 * task's user namespace, -EINVAL for CLONE_NEWIPC combined with
 * CLONE_SYSVSEM, or the error reported by create_new_namespaces().
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
       struct nsproxy *old_ns = tsk->nsproxy;
       struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
       struct nsproxy *new_ns;


       /* Common case: no new namespace requested, share the nsproxy. */
       if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
                           CLONE_NEWPID | CLONE_NEWNET)))) {
              get_nsproxy(old_ns);
              return 0;
       }


       /* Creating any new namespace requires CAP_SYS_ADMIN. */
       if (!ns_capable(user_ns, CAP_SYS_ADMIN))
              return -EPERM;


       /*
        * CLONE_NEWIPC must detach from the undolist: after switching
        * to a new ipc namespace, the semaphore arrays from the old
        * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM
        * means share undolist with parent, so we must forbid using
        * it along with CLONE_NEWIPC.
        */
       if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
              (CLONE_NEWIPC | CLONE_SYSVSEM))
              return -EINVAL;


       new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
       if (IS_ERR(new_ns))
              return  PTR_ERR(new_ns);


       tsk->nsproxy = new_ns;
       return 0;
}

###Create_new_namespaces源码分析 Create_new_namespaces()函数实现定义在linux-4.4.19/kernel/nsproxy.c #59

/*
 * Create new nsproxy and all of its the associated namespaces.
 * Return the newly created nsproxy.  Do not attach this to the task,
 * leave it to the caller to do proper locking and attach it to task.
 *
 * Each namespace pointer is filled in by the matching copy_*() helper in
 * the order mnt, uts, ipc, pid, net. If one of them fails, the namespaces
 * obtained so far are released in reverse order through the out_* labels,
 * the nsproxy is freed, and the error is returned as an ERR_PTR().
 * Returns ERR_PTR(-ENOMEM) if the nsproxy itself cannot be allocated.
 */
static struct nsproxy *create_new_namespaces(unsigned long flags,
       struct task_struct *tsk, struct user_namespace *user_ns,
       struct fs_struct *new_fs)
{
       struct nsproxy *new_nsp;
       int err;


       new_nsp = create_nsproxy();
       if (!new_nsp)
              return ERR_PTR(-ENOMEM);


       /* mount namespace */
       new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
       if (IS_ERR(new_nsp->mnt_ns)) {
              err = PTR_ERR(new_nsp->mnt_ns);
              goto out_ns;
       }


       /* UTS namespace */
       new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
       if (IS_ERR(new_nsp->uts_ns)) {
              err = PTR_ERR(new_nsp->uts_ns);
              goto out_uts;
       }


       /* IPC namespace */
       new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
       if (IS_ERR(new_nsp->ipc_ns)) {
              err = PTR_ERR(new_nsp->ipc_ns);
              goto out_ipc;
       }


       /* PID namespace for future children */
       new_nsp->pid_ns_for_children =
              copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
       if (IS_ERR(new_nsp->pid_ns_for_children)) {
              err = PTR_ERR(new_nsp->pid_ns_for_children);
              goto out_pid;
       }


       /* network namespace */
       new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
       if (IS_ERR(new_nsp->net_ns)) {
              err = PTR_ERR(new_nsp->net_ns);
              goto out_net;
       }


       return new_nsp;


       /* Error unwinding: each label releases what was acquired before the failing step. */
out_net:
       if (new_nsp->pid_ns_for_children)
              put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
       if (new_nsp->ipc_ns)
              put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
       if (new_nsp->uts_ns)
              put_uts_ns(new_nsp->uts_ns);
out_uts:
       if (new_nsp->mnt_ns)
              put_mnt_ns(new_nsp->mnt_ns);
out_ns:
       kmem_cache_free(nsproxy_cachep, new_nsp);
       return ERR_PTR(err);
}

在create_new_namespaces()中,分别调用create_nsproxy(), copy_mnt_ns(), copy_utsname(), copy_ipcs(), copy_pid_ns(), copy_net_ns(),具体的实现请参考如下索引。 create_nsproxy() —— linux-4.4.19/kernel/nsproxy.c #44 copy_utsname() ——linux-4.4.19/kernel/utsname.c #66 copy_mnt_ns() ——linux-4.4.19/fs/namespace.c #2775 copy_ipcs() —— linux-4.4.19/ipc/namespace.c #54 copy_pid_ns() —— linux-4.4.19/kernel/pid_namespace.c #153 copy_net_ns() —— linux-4.4.19/net/core/net_namespace.c #351 create new namespace代码流程图:

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

相关文章

来自专栏Kubernetes

原 荐 Kubernetes Statefu

Author: xidianwangtao@gmail.com,Based on Kubernetes 1.9 摘要:Kubernetes Statef...

4988
来自专栏Android 研究

APK安装流程详解2——PackageManager简介

俗话说的好,得中原者,得天下,那么想要了解Android的安装了流程就不得不提及一个重要的类"PackageManager"我们就先来了解这两个类

2833
来自专栏架构师之旅

《Spring敲门砖之基础教程第一季》 第二章(1) Spring框架之IOC首例-HelloWorld

回顾 上一章我们主要学习了Spring的一些理论知识,对Spring框架有了一个总体的概括,大家应该在头脑里形成一个初步的印象,接下来我们就会针对Spring框...

19910
来自专栏noteless

springmvc 项目完整示例02 项目创建-eclipse创建动态web项目 配置文件 junit单元测试

spring原理案例-基本项目搭建 01 spring framework 下载 官网下载spring jar包

1742
来自专栏Java编程技术

Netty学习笔记(一)

Netty是一种可以轻松快速的开发类似协议服务器和客户端网络应用程序的NIO客户端服务器框架,它大大简化了TCP或者UDP服务器的网络编程,但是你仍然可以访问和...

1562
来自专栏kl的专栏

spring batch进阶-基于RabbitMQ远程分区Step

关于spring batch概念及基本使用,可移步《spring batch精选,一文吃透spring batch》,本文主要内容为spring batch的进...

6467
来自专栏ImportSource

针对事件驱动架构的Spring Cloud Stream

今天我们要分享一个比较有意思的内容。就是如何通过spring cloud 的stream来改造一个微服务下事件驱动的框架。 为什么要改造?我们都知道事件驱动的微...

4458
来自专栏WindCoder

SpringBoot邂逅Shiro-前后端分离时的配置

本篇仅是记录集成的基础过程,至于shiro框架的基础概念和使用细节,可以自行查阅相关资料,本文不做讨论。

4.3K2
来自专栏JMCui

Netty 系列八(基于 WebSocket 的简单聊天室).

    之前写过一篇 Spring 集成 WebSocket 协议的文章 —— Spring消息之WebSocket ,所以对于 WebSocket 协议的介绍...

2475
来自专栏码匠的流水账

jvm排查工具箱jvm-tools

本文主要介绍的是一款jvm排查工具箱:jvm-tools。除了对基本jvm封装外,还提供了jmx访问以及火焰图的生成。

1091

扫码关注云+社区

领取腾讯云代金券