前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >Linux tcp/ip 源码分析 - socket

Linux tcp/ip 源码分析 - socket

作者头像
KINGYT
发布2019-05-30 19:18:46
5.7K0
发布2019-05-30 19:18:46
举报

Linux下的tcp编程中,第一步就是要创建socket,本文将从源码角度看下socket是如何被创建的。

本文使用的linux内核版本为

➜ bionic git:(ffdd392b8196) git remote get-url origin git://git.launchpad.net/~ubuntu-kernel/ubuntu/+source/linux/+git/bionic ➜ bionic git:(ffdd392b8196) git status HEAD detached at Ubuntu-4.15.0-45.48

有关如何找到当前运行的Ubuntu版本对应的内核源码,请参考 找到运行的Ubuntu版本对应的内核源码

在看具体的源码分析之前,最好先看下socket的man文档,这样能对socket api有个大概的了解。

man 2 socket

socket系统调用对应的内核源码为

// net/socket.c SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { ... struct socket *sock; ... flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; ... retval = sock_create(family, type, protocol, &sock); ... return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); }

sock_create 方法会创建并初始化socket,sock_map_fd 方法会为socket分配文件并返回对应的文件描述符。

在看sock_create实现之前,我们先看下sock_map_fd方法。

// net/socket.c static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); ... newfile = sock_alloc_file(sock, flags, NULL); if (likely(!IS_ERR(newfile))) { fd_install(fd, newfile); return fd; } ... }

该方法大体操作为

1. 找到一个未使用的文件描述符fd。

2. 为socket分配一个struct file实例。

3. 建立fd到socket file的映射关系,并返回fd给上层。

再看下sock_alloc_file方法

// net/socket.c struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) { ... struct file *file; ... file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops); ... sock->file = file; file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->private_data = sock; return file; } EXPORT_SYMBOL(sock_alloc_file);

该方法会调用alloc_file方法创建并初始化struct file实例,之后将file的private_data字段设置为socket,这样内核就可以先根据fd找到file,再根据file的private_data字段就能找到socket。

看下alloc_file方法

// fs/file_table.c struct file *alloc_file(const struct path *path, fmode_t mode, const struct file_operations *fop) { struct file *file; file = get_empty_filp(); ... file->f_mode = mode; file->f_op = fop; ... return file; } EXPORT_SYMBOL(alloc_file);

该方法我们需要关注的就是file的f_op字段,该字段类型为struct file_operations,指向的值为socket_file_ops。

struct file_operations 的内容为文件操作的各种函数指针,比如read、write等。不同类型的file的f_op字段指向不同的值,这样在执行文件操作时,不同类型的文件,行为也是不同的。

// include/linux/fs.h struct file_operations { ... ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); ... unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); ... int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); ... } __randomize_layout;

好,sock_map_fd方法我们就讲完了,我们再回头看下socket方法里的sock_create方法

// net/socket.c int sock_create(int family, int type, int protocol, struct socket **res) { return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); } EXPORT_SYMBOL(sock_create);

再看下__sock_create方法

// net/socket.c int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { ... struct socket *sock; const struct net_proto_family *pf; ... sock = sock_alloc(); ... sock->type = type; ... pf = rcu_dereference(net_families[family]); ... err = pf->create(net, sock, protocol, kern); ... *res = sock; return 0; ... } EXPORT_SYMBOL(__sock_create);

该方法会先调用sock_alloc,分配一个socket实例,socket类型为

// include/linux/net.h struct socket { socket_state state; short type; unsigned long flags; struct socket_wq __rcu *wq; struct file *file; struct sock *sk; const struct proto_ops *ops; };

然后调用net_proto_family的create方法初始化socket实例中的各字段。

当我们创建基于ipv4的tcp连接时,family值一般为AF_INET,net_families[family]的值为inet_family_ops。

看下inet_family_ops变量的定义

// net/ipv4/af_inet.c static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, };

所以,上面调用的pf->create方法其实对应的就是inet_create方法。

// net/ipv4/af_inet.c static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; ... sock->state = SS_UNCONNECTED; ... list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { // 根据type和protocol,找到对应的struct inet_protosw类型的实例 } ... sock->ops = answer->ops; answer_prot = answer->prot; ... sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); ... sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; ... if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); ... } ... return err; ... }

该方法的大体操作为

1. 遍历inetsw[sock->type]列表,根据type和protocol找到对应的struct inet_protosw实例,局部变量answer指向该实例。

对于ipv4的tcp连接来说,type一般为SOCK_STREAM,protocol一般为0,其对应的struct inet_protosw实例为

// net/ipv4/af_inet.c static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, ... };

2. 将answer->ops字段赋值给sock->ops,最终值为&inet_stream_ops。

// net/ipv4/af_inet.c const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif }; EXPORT_SYMBOL(inet_stream_ops);

3. 调用sk_alloc方法,分配struct sock类型实例。

// net/core/sock.c struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk; sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) { sk->sk_family = family; ... sk->sk_prot = sk->sk_prot_creator = prot; ... } return sk; } EXPORT_SYMBOL(sk_alloc);

该方法需要注意的是,会将sk->sk_prot字段赋值为prot,即&tcp_prot,该字段在tcp相关逻辑中经常会用到。

// net/ipv4/tcp_ipv4.c struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot);

4. 调用sock_init_data方法,初始化struct sock实例sk。

// net/core/sock.c void sock_init_data(struct socket *sock, struct sock *sk) { sk_init_common(sk); ... sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; sk->sk_state = TCP_CLOSE; ... sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; sk->sk_error_report = sock_def_error_report; ... } EXPORT_SYMBOL(sock_init_data);

该方法需要注意的是以下几个字段

sk->sk_state_change 状态变化时调用

sk->sk_data_ready 有可读数据时调用

sk->sk_write_space 有可写空间时调用

sk->sk_error_report 发生错误时调用

这几个字段指向的值都是相应的回调方法,当相应的事件发生时,这些回调方法就会被调用,通知上层事件发生。

epoll的通知就是用这些回调来实现的。

有关epoll的源码分析,请参见

Linux epoll 源码分析 1

Linux epoll 源码分析 2

Linux epoll 源码分析 3

5. 调用sk->sk_prot->init(sk) 继续初始化sk。

由上我们可以得知,sk->sk_prot的值为&tcp_prot,所以此处的init方法对应为tcp_v4_init_sock。

// net/ipv4/tcp_ipv4.c static int tcp_v4_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv4_specific; ... return 0; }

该方法需要注意的是icsk->icsk_af_ops字段的值为&ipv4_specific。

// net/ipv4/tcp_ipv4.c const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, #endif .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_SYMBOL(ipv4_specific);

tcp_v4_init_sock方法还调用了tcp_init_sock方法对tcp逻辑进行了初始化。

// net/ipv4/tcp.c void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); ... sk->sk_write_space = sk_stream_write_space; ... } EXPORT_SYMBOL(tcp_init_sock);

该方法需要注意的就是,sk->sk_write_space的值又被赋值为sk_stream_write_space,即socket可写的回调对于tcp来说是sk_stream_write_space,而不是默认的sock_def_write_space。

至此,socket的创建并初始化就全部完成了。

我们再来总结下tcp socket重要字段的最终值是什么

1. 根据文件描述符fd可找到struct file实例file。

2. file->f_op字段值为&socket_file_ops。

3. 根据file->private_data字段可获得struct socket实例sock。

4. sock->ops字段值为&inet_stream_ops。

5. sock->sk字段对应的值的类型为struct sock,该类型为内核内部实际用来存储socket数据的地方。

6. sk->sk_prot字段的值为&tcp_prot。

7. sk->sk_state字段值为TCP_CLOSE。

8. sk状态变化回调函数分别为

sk->sk_state_change = sock_def_wakeup;

sk->sk_data_ready = sock_def_readable;

sk->sk_write_space = sk_stream_write_space;

sk->sk_error_report = sock_def_error_report;

本文参与 腾讯云自媒体分享计划,分享自微信公众号。
原始发表:2019-02-25,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 Linux内核及JVM底层相关技术研究 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档