Linux tcp/ip 源码分析 - socket

Linux下的tcp编程中,第一步就是要创建socket,本文将从源码角度看下socket是如何被创建的。

本文使用的linux内核版本为

➜ bionic git:(ffdd392b8196) git remote get-url origin
git://git.launchpad.net/~ubuntu-kernel/ubuntu/+source/linux/+git/bionic
➜ bionic git:(ffdd392b8196) git status
HEAD detached at Ubuntu-4.15.0-45.48

有关如何找到当前运行的Ubuntu版本对应的内核源码,请参考 找到运行的Ubuntu版本对应的内核源码

在看具体的源码分析之前,最好先看下socket的man文档,这样能对socket api有个大概的了解。

man 2 socket

socket系统调用对应的内核源码为

// net/socket.c SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { ... struct socket *sock; ... flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; ... retval = sock_create(family, type, protocol, &sock); ... return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); }

sock_create 方法会创建并初始化socket,sock_map_fd 方法会为socket分配文件并返回对应的文件描述符。

在看sock_create实现之前,我们先看下sock_map_fd方法。

// net/socket.c static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); ... newfile = sock_alloc_file(sock, flags, NULL); if (likely(!IS_ERR(newfile))) { fd_install(fd, newfile); return fd; } ... }

该方法大体操作为

1. 找到一个未使用的文件描述符fd。

2. 为socket分配一个struct file实例。

3. 建立fd到socket file的映射关系,并返回fd给上层。

再看下sock_alloc_file方法

// net/socket.c struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) { ... struct file *file; ... file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops); ... sock->file = file; file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->private_data = sock; return file; } EXPORT_SYMBOL(sock_alloc_file);

该方法会调用alloc_file方法创建并初始化struct file实例,之后将file的private_data字段设置为socket,这样内核就可以先根据fd找到file,再根据file的private_data字段找到socket。

看下alloc_file方法

// fs/file_table.c struct file *alloc_file(const struct path *path, fmode_t mode, const struct file_operations *fop) { struct file *file; file = get_empty_filp(); ... file->f_mode = mode; file->f_op = fop; ... return file; } EXPORT_SYMBOL(alloc_file);

该方法我们需要关注的就是file的f_op字段,该字段类型为struct file_operations,指向的值为socket_file_ops。

struct file_operations 的内容为文件操作的各种函数指针,比如read、write等。不同类型的file的f_op字段指向不同的值,这样在执行文件操作时,不同类型的文件,行为也是不同的。

// include/linux/fs.h struct file_operations { ... ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); ... unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); ... int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); ... } __randomize_layout;

好,sock_map_fd方法我们就讲完了,我们再回头看下socket方法里的sock_create方法

// net/socket.c int sock_create(int family, int type, int protocol, struct socket **res) { return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); } EXPORT_SYMBOL(sock_create);

再看下__sock_create方法

// net/socket.c int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { ... struct socket *sock; const struct net_proto_family *pf; ... sock = sock_alloc(); ... sock->type = type; ... pf = rcu_dereference(net_families[family]); ... err = pf->create(net, sock, protocol, kern); ... *res = sock; return 0; ... } EXPORT_SYMBOL(__sock_create);

该方法会先调用sock_alloc,分配一个struct socket实例,struct socket的定义为

// include/linux/net.h struct socket { socket_state state; short type; unsigned long flags; struct socket_wq __rcu *wq; struct file *file; struct sock *sk; const struct proto_ops *ops; };

然后调用net_proto_family的create方法初始化socket实例中的各字段。

当我们创建基于ipv4的tcp连接时,family值一般为AF_INET,net_families[family]的值为&inet_family_ops。

看下inet_family_ops变量的定义

// net/ipv4/af_inet.c static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, };

所以,上面调用的pf->create方法其实对应的就是inet_create方法。

// net/ipv4/af_inet.c static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; ... sock->state = SS_UNCONNECTED; ... list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { // 根据type和protocol,找到对应的struct inet_protosw类型的实例 } ... sock->ops = answer->ops; answer_prot = answer->prot; ... sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); ... sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; ... if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); ... } ... return err; ... }

该方法的大体操作为

1. 遍历inetsw[sock->type]列表,根据type和protocol找到对应的struct inet_protosw实例,局部变量answer指向该实例。

对于ipv4的tcp连接来说,type一般为SOCK_STREAM,protocol一般为0(即IPPROTO_IP,在匹配时作为通配值,会匹配到inetsw[SOCK_STREAM]列表中的第一项,并将protocol更新为该项的protocol,即IPPROTO_TCP),其对应的struct inet_protosw实例为

// net/ipv4/af_inet.c static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, ... };

2. 将answer->ops字段赋值给sock->ops,最终值为&inet_stream_ops。

// net/ipv4/af_inet.c const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif }; EXPORT_SYMBOL(inet_stream_ops);

3. 调用sk_alloc方法,分配struct sock类型实例。

// net/core/sock.c struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk; sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) { sk->sk_family = family; ... sk->sk_prot = sk->sk_prot_creator = prot; ... } return sk; } EXPORT_SYMBOL(sk_alloc);

该方法需要注意的是,会将sk->sk_prot字段赋值为prot,即&tcp_prot,该字段在tcp相关逻辑中经常会用到。

// net/ipv4/tcp_ipv4.c struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot);

4. 调用sock_init_data方法,初始化struct sock实例sk。

// net/core/sock.c void sock_init_data(struct socket *sock, struct sock *sk) { sk_init_common(sk); ... sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; sk->sk_state = TCP_CLOSE; ... sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; sk->sk_error_report = sock_def_error_report; ... } EXPORT_SYMBOL(sock_init_data);

该方法需要注意的是以下几个字段

sk->sk_state_change 状态变化时调用

sk->sk_data_ready 有可读数据时调用

sk->sk_write_space 有可写空间时调用

sk->sk_error_report 发生错误时调用

这几个字段指向的值都是相应的回调方法,当相应的事件发生时,这些回调方法就会被调用,通知上层事件发生。

epoll的通知就是用这些回调来实现的。

有关epoll的源码分析,请参见

Linux epoll 源码分析 1

Linux epoll 源码分析 2

Linux epoll 源码分析 3

5. 调用sk->sk_prot->init(sk) 继续初始化sk。

由上我们可以得知,sk->sk_prot的值为&tcp_prot,所以此处的init方法对应为tcp_v4_init_sock。

// net/ipv4/tcp_ipv4.c static int tcp_v4_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv4_specific; ... return 0; }

该方法需要注意的是icsk->icsk_af_ops字段的值为&ipv4_specific。

// net/ipv4/tcp_ipv4.c const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, #endif .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_SYMBOL(ipv4_specific);

tcp_v4_init_sock方法还调用了tcp_init_sock方法对tcp逻辑进行了初始化。

// net/ipv4/tcp.c void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); ... sk->sk_write_space = sk_stream_write_space; ... } EXPORT_SYMBOL(tcp_init_sock);

该方法需要注意的是,sk->sk_write_space被重新赋值为sk_stream_write_space,即对于tcp来说,socket可写时的回调是sk_stream_write_space,而不是默认的sock_def_write_space。

至此,socket的创建并初始化就全部完成了。

我们再来总结下tcp socket重要字段的最终值是什么

1. 根据文件描述符fd可找到struct file实例file。

2. file->f_op字段值为&socket_file_ops。

3. 根据file->private_data字段可获得struct socket实例sock。

4. sock->ops字段值为&inet_stream_ops。

5. sock->sk字段对应的值的类型为struct sock,该类型为内核内部实际用来存储socket数据的地方。

6. sk->sk_prot字段的值为&tcp_prot。

7. sk->sk_state字段值为TCP_CLOSE。

8. sk状态变化回调函数分别为

sk->sk_state_change = sock_def_wakeup;

sk->sk_data_ready = sock_def_readable;

sk->sk_write_space = sk_stream_write_space;

sk->sk_error_report = sock_def_error_report;

原文发布于微信公众号 - Linux内核及JVM底层相关技术研究(ytcode)

原文发表时间:2019-02-25

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

扫码关注云+社区

领取腾讯云代金券