In the previous article, <dpvs源码分析(续)>, we used TCP as an example to walk through connection establishment and mentioned terms such as full-NAT and SNAT along the way. In this article we look at the connection establishment process in more detail.
static int tcp_conn_sched(struct dp_vs_proto *proto,
                          const struct dp_vs_iphdr *iph,
                          struct rte_mbuf *mbuf,
                          struct dp_vs_conn **conn,
                          int *verdict)
{
    struct tcphdr *th, _tcph;
    struct dp_vs_service *svc;
    assert(proto && iph && mbuf && conn && verdict);

    th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
    if (unlikely(!th)) {
        *verdict = INET_DROP;
        return EDPVS_INVPKT;
    }

    /* Syn-proxy step 2 logic: receive client's 3-handshacke ack packet */
    /* When synproxy disabled, only SYN packets can arrive here.
     * So don't judge SYNPROXY flag here! If SYNPROXY flag judged, and syn_proxy
     * got disbled and keepalived reloaded, SYN packets for RS may never be sent. */
    if (dp_vs_synproxy_ack_rcv(iph->af, mbuf, th, proto, conn, iph, verdict) == 0) {
        /* Attention: First ACK packet is also stored in conn->ack_mbuf */
        return EDPVS_PKTSTOLEN;
    }

    ..................

    svc = dp_vs_service_lookup(iph->af, iph->proto, &iph->daddr, th->dest, 0, mbuf, NULL);
    if (!svc) {
        *verdict = INET_ACCEPT;
        return EDPVS_NOSERV;
    }

    ..............................

    *conn = dp_vs_schedule(svc, iph, mbuf, false);
    if (!*conn) {
        dp_vs_service_put(svc);
        *verdict = INET_DROP;
        return EDPVS_RESOURCE;
    }

    dp_vs_service_put(svc);
    return EDPVS_OK;
}
The code is trimmed again; the removed parts are replaced with "...................", but the main logic is intact. In this function, dp_vs_synproxy_ack_rcv implements the TCP handshake proxy (syn-proxy): the client first completes the three-way handshake with the machine running DPVS, and only then does the DPVS machine perform the handshake with the real server (RS); we will not dig into those details here. The second call of interest is dp_vs_service_lookup. What does it do? The name pretty much gives it away: it looks up a service using the passed-in parameters, namely the protocol, IP and port. A service is a collection of dests, and a dest is a real server. Once the svc has been found, we can get ready to establish the connection.
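To make the lookup step more concrete, here is a minimal sketch of a table keyed by <address family, protocol, virtual IP, virtual port>. The svc_key/svc_entry types, svc_table and svc_lookup below are simplified stand-ins invented for illustration, not DPVS's actual data structures:

#include <stdint.h>
#include <sys/queue.h>

/* Simplified stand-ins: a service is identified by <af, proto, vip, vport>. */
struct svc_key {
    int      af;      /* AF_INET or AF_INET6 */
    uint8_t  proto;   /* IPPROTO_TCP, ... */
    uint32_t vip;     /* virtual IP (IPv4 only, for brevity) */
    uint16_t vport;   /* virtual port, network byte order */
};

struct svc_entry {
    struct svc_key key;
    LIST_ENTRY(svc_entry) node;
    /* ... list of dests (real servers), scheduler, flags ... */
};

#define SVC_TAB_BITS 8
#define SVC_TAB_SIZE (1 << SVC_TAB_BITS)

/* One bucket per hash value; collisions are chained. */
LIST_HEAD(svc_bucket, svc_entry);
static struct svc_bucket svc_table[SVC_TAB_SIZE];

static unsigned svc_hash(const struct svc_key *k)
{
    /* Any cheap mix of the tuple will do for the sketch. */
    return (k->vip ^ k->vport ^ k->proto) & (SVC_TAB_SIZE - 1);
}

/* Look up a service by <af, proto, vip, vport>; returning NULL corresponds
 * to the EDPVS_NOSERV branch in tcp_conn_sched above. */
static struct svc_entry *svc_lookup(int af, uint8_t proto,
                                    uint32_t vip, uint16_t vport)
{
    struct svc_key k = { .af = af, .proto = proto, .vip = vip, .vport = vport };
    struct svc_entry *svc;

    LIST_FOREACH(svc, &svc_table[svc_hash(&k)], node) {
        if (svc->key.af == k.af && svc->key.proto == k.proto &&
            svc->key.vip == k.vip && svc->key.vport == k.vport)
            return svc;
    }
    return NULL;
}

The only important point here is the key: a packet's destination IP and port (plus protocol) decide which service, and therefore which pool of real servers, it belongs to.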
2. dp_vs_schedule
/* select an RS by service's scheduler and create a connection */
struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc,
                                  const struct dp_vs_iphdr *iph,
                                  struct rte_mbuf *mbuf,
                                  bool is_synproxy_on)
{
    uint16_t _ports[2], *ports; /* sport, dport */
    struct dp_vs_dest *dest;
    struct dp_vs_conn *conn;
    struct dp_vs_conn_param param;
    struct sockaddr_in daddr, saddr;
    int err;

    assert(svc && iph && mbuf);

    ports = mbuf_header_pointer(mbuf, iph->len, sizeof(_ports), _ports);
    if (!ports)
        return NULL;

    /* persistent service */
    if (svc->flags & DP_VS_SVC_F_PERSISTENT)
        return dp_vs_sched_persist(svc, iph, mbuf, is_synproxy_on);

    dest = svc->scheduler->schedule(svc, mbuf);
    if (!dest) {
        RTE_LOG(WARNING, IPVS, "%s: no dest found.\n", __func__);
        return NULL;
    }

    if (dest->fwdmode == DPVS_FWD_MODE_SNAT) {
        if (unlikely(iph->proto == IPPROTO_ICMP)) {
            struct icmphdr *ich, _icmph;
            ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph);
            if (!ich)
                return NULL;

            ports = _ports;
            _ports[0] = icmp4_id(ich);
            _ports[1] = ich->type << 8 | ich->code;

            /* ID may confict for diff host,
             * need we use ID pool ? */
            dp_vs_conn_fill_param(iph->af, iph->proto,
                                  &iph->daddr, &dest->addr,
                                  ports[1], ports[0], 0, &param);
        } else {
            /* we cannot inherit dest (host's src port),
             * that may confict for diff hosts,
             * and using dest->port is worse choice. */
            memset(&daddr, 0, sizeof(daddr));
            daddr.sin_family = AF_INET;
            daddr.sin_addr = iph->daddr.in;
            daddr.sin_port = ports[1];

            memset(&saddr, 0, sizeof(saddr));
            saddr.sin_family = AF_INET;
            saddr.sin_addr = dest->addr.in;
            saddr.sin_port = 0;

            err = sa_fetch(NULL, &daddr, &saddr);
            if (err != 0)
                return NULL;

            dp_vs_conn_fill_param(iph->af, iph->proto,
                                  &iph->daddr, &dest->addr,
                                  ports[1], saddr.sin_port, 0, &param);
        }
    } else {
        if (unlikely(iph->proto == IPPROTO_ICMP)) {
            struct icmphdr *ich, _icmph;
            ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph);
            if (!ich)
                return NULL;

            ports = _ports;
            _ports[0] = icmp4_id(ich);
            _ports[1] = ich->type << 8 | ich->code;

            dp_vs_conn_fill_param(iph->af, iph->proto,
                                  &iph->saddr, &iph->daddr,
                                  ports[0], ports[1], 0, &param);
        } else {
            dp_vs_conn_fill_param(iph->af, iph->proto,
                                  &iph->saddr, &iph->daddr,
                                  ports[0], ports[1], 0, &param);
        }
    }

    conn = dp_vs_conn_new(mbuf, &param, dest,
                          is_synproxy_on ? DPVS_CONN_F_SYNPROXY : 0);
    if (!conn) {
        if (dest->fwdmode == DPVS_FWD_MODE_SNAT && iph->proto != IPPROTO_ICMP)
            sa_release(NULL, &daddr, &saddr);
        return NULL;
    }

    dp_vs_stats_conn(conn);
    return conn;
}
OK, continuing with the code; the comment actually describes it quite clearly: select an RS (real server) through the service's scheduler, then create a new connection. The function first calls dest = svc->scheduler->schedule(svc, mbuf); which simply means: following the service's scheduling policy, pick one dest (i.e. one RS) from the service. The structure behind svc is struct dp_vs_service, and it has a member struct dp_vs_scheduler *scheduler. Let's take a look at that structure:
struct dp_vs_scheduler {
    struct list_head    n_list;
    char                *name;
    // rte_atomic32_t   refcnt;
    struct dp_vs_dest *(*schedule)(struct dp_vs_service *svc,
                                   const struct rte_mbuf *mbuf);
    int (*init_service)(struct dp_vs_service *svc);
    int (*exit_service)(struct dp_vs_service *svc);
    int (*update_service)(struct dp_vs_service *svc);
} __rte_cache_aligned;
In other words, different dp_vs_scheduler objects implement different scheduling strategies. Doesn't this look a bit like the job and pkt_type registrations in the previous two articles? DPVS has three such schedulers: dp_vs_rr_scheduler, dp_vs_wlc_scheduler and dp_vs_wrr_scheduler:
static struct dp_vs_scheduler dp_vs_rr_scheduler = {
    .name = "rr", /* name */
    // .refcnt = ATOMIC_INIT(0),
    .n_list = LIST_HEAD_INIT(dp_vs_rr_scheduler.n_list),
    .init_service = dp_vs_rr_init_svc,
    .update_service = dp_vs_rr_update_svc,
    .schedule = dp_vs_rr_schedule,
};

static struct dp_vs_scheduler dp_vs_wlc_scheduler = {
    .name = "wlc",
    .n_list = LIST_HEAD_INIT(dp_vs_wlc_scheduler.n_list),
    .schedule = dp_vs_wlc_schedule,
};

static struct dp_vs_scheduler dp_vs_wrr_scheduler = {
    .name = "wrr",
    .n_list = LIST_HEAD_INIT(dp_vs_wrr_scheduler.n_list),
    .init_service = dp_vs_wrr_init_svc,
    .exit_service = dp_vs_wrr_done_svc,
    .update_service = dp_vs_wrr_update_svc,
    .schedule = dp_vs_wrr_schedule,
};
These schedulers are registered along the path dpvs_init -> dp_vs_sched_init: dp_vs_sched_init calls dp_vs_rr_init, dp_vs_wrr_init and dp_vs_wlc_init to register the three schedulers respectively. We will not go into the individual scheduling algorithms here. The reason for tracing this code is to tie the pieces of DPVS together: when you first read the DPVS code, main is just a pile of init calls followed by the startup of the data-plane and control-plane threads, which can be quite bewildering. But if you start from the data-plane threads and keep tracing forward, you will find that every one of those earlier initializations gets used later on. The registration pattern itself is sketched below.
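The sched_list, sched_register and sched_lookup_by_name names below are made up for this sketch; they only mirror the idea that dp_vs_sched_init puts each scheduler into a global list, from which a service later picks its scheduler by name ("rr", "wrr", "wlc"):

#include <string.h>
#include <sys/queue.h>

/* Made-up scheduler ops for the sketch; DPVS's real struct dp_vs_scheduler is shown above. */
struct sched_ops {
    const char *name;                         /* "rr", "wrr", "wlc", ... */
    void *(*schedule)(void *svc, void *mbuf); /* pick a dest for this packet */
    LIST_ENTRY(sched_ops) node;
};

/* Global registry filled once at init time (cf. dp_vs_sched_init). */
static LIST_HEAD(, sched_ops) sched_list = LIST_HEAD_INITIALIZER(sched_list);

/* Called by each scheduler's init function to register itself. */
static int sched_register(struct sched_ops *ops)
{
    if (!ops || !ops->name || !ops->schedule)
        return -1;
    LIST_INSERT_HEAD(&sched_list, ops, node);
    return 0;
}

/* Called when a service is configured: bind the service's scheduler by name. */
static struct sched_ops *sched_lookup_by_name(const char *name)
{
    struct sched_ops *ops;

    LIST_FOREACH(ops, &sched_list, node) {
        if (strcmp(ops->name, name) == 0)
            return ops;
    }
    return NULL;
}

With this in place, the three scheduler objects shown above are just three entries in one list, and a service configured with the "rr" policy simply ends up holding a pointer to that entry.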
Back to dp_vs_schedule: once a dest has been chosen, there are several calls to dp_vs_conn_fill_param. Its main job is to fill in its last argument, param, which records the source IP/port and destination IP/port. This param is later used by dp_vs_conn_new; a simplified sketch of this step follows.
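For illustration, a stripped-down version of such a parameter block and its fill helper might look like the following; conn_param and conn_fill_param are assumed names for the sketch, and the real dp_vs_conn_param carries more fields than shown here:

#include <stdint.h>
#include <netinet/in.h>

/* Simplified stand-in for dp_vs_conn_param: the tuple that the
 * connection-creation step needs. */
struct conn_param {
    int             af;     /* AF_INET / AF_INET6 */
    uint8_t         proto;  /* IPPROTO_TCP, IPPROTO_UDP, ... */
    struct in_addr  saddr;  /* source address (IPv4 only, for brevity) */
    struct in_addr  daddr;  /* destination address */
    uint16_t        sport;  /* source port, network byte order */
    uint16_t        dport;  /* destination port, network byte order */
};

/* Fill in the parameter block; mirrors the role of dp_vs_conn_fill_param. */
static inline void conn_fill_param(int af, uint8_t proto,
                                   const struct in_addr *saddr,
                                   const struct in_addr *daddr,
                                   uint16_t sport, uint16_t dport,
                                   struct conn_param *param)
{
    param->af    = af;
    param->proto = proto;
    param->saddr = *saddr;
    param->daddr = *daddr;
    param->sport = sport;
    param->dport = dport;
}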
struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf,
                                  struct dp_vs_conn_param *param,
                                  struct dp_vs_dest *dest,
                                  uint32_t flags)
dp_vs_conn_new is fairly long, so it is not reproduced in full here; a rough description of its logic will do. As the signature suggests, the function ultimately builds a struct dp_vs_conn object, which stores the connection's source and destination IP and port. The function also records the address/port mappings for both the inbound and the outbound direction. Take SNAT as an example: the client sends a packet to DPVS, DPVS forwards it to an RS, the RS replies to DPVS, and DPVS forwards the reply back to the client, so DPVS has to keep this mapping. It is also worth noting that struct dp_vs_conn has two function-pointer members, packet_xmit and packet_out_xmit: one transmits towards the RS, the other back towards the client. The actual assignment of these two pointers happens in conn_bind_dest, which is called from dp_vs_conn_new, and it differs per forwarding mode, roughly as sketched below.
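Here is a minimal sketch of that per-mode binding, assuming a connection object that carries the two callbacks described above; the conn, fwd_mode and xmit_* names are placeholders rather than the actual DPVS symbols, and everything else conn_bind_dest does is omitted:

/* Placeholder types for the sketch; not DPVS's real definitions. */
struct conn;
struct pktbuf;   /* stands in for struct rte_mbuf */

typedef int (*xmit_fn)(struct conn *c, struct pktbuf *p);

enum fwd_mode { FWD_MODE_NAT, FWD_MODE_FNAT, FWD_MODE_SNAT, FWD_MODE_OTHER };

struct conn {
    enum fwd_mode mode;
    xmit_fn packet_xmit;      /* forward towards the RS (inbound direction) */
    xmit_fn packet_out_xmit;  /* forward back towards the client (outbound) */
};

/* Hypothetical per-mode transmit routines; stubs here, while the real ones
 * would rewrite headers and send the packet out. */
static int xmit_nat_in(struct conn *c, struct pktbuf *p)   { (void)c; (void)p; return 0; }
static int xmit_nat_out(struct conn *c, struct pktbuf *p)  { (void)c; (void)p; return 0; }
static int xmit_fnat_in(struct conn *c, struct pktbuf *p)  { (void)c; (void)p; return 0; }
static int xmit_fnat_out(struct conn *c, struct pktbuf *p) { (void)c; (void)p; return 0; }
static int xmit_snat_in(struct conn *c, struct pktbuf *p)  { (void)c; (void)p; return 0; }
static int xmit_snat_out(struct conn *c, struct pktbuf *p) { (void)c; (void)p; return 0; }

/* Bind the two callbacks according to the forwarding mode,
 * in the spirit of conn_bind_dest. */
static int bind_dest(struct conn *c, enum fwd_mode mode)
{
    switch (mode) {
    case FWD_MODE_NAT:
        c->packet_xmit     = xmit_nat_in;
        c->packet_out_xmit = xmit_nat_out;
        break;
    case FWD_MODE_FNAT:
        c->packet_xmit     = xmit_fnat_in;
        c->packet_out_xmit = xmit_fnat_out;
        break;
    case FWD_MODE_SNAT:
        c->packet_xmit     = xmit_snat_in;
        c->packet_out_xmit = xmit_snat_out;
        break;
    default:
        return -1;  /* other modes (e.g. DR, tunnel) omitted in this sketch */
    }
    c->mode = mode;
    return 0;
}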
With that, the DPVS connection setup is complete. Going back to the dp_vs_in function at the end of the second article, <dpvs源码分析(续)>: once the connection is established, a packet arriving from the client goes through xmit_inbound, which in turn calls dp_vs_conn::packet_xmit to do the actual forwarding.
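Tying this back to the previous sketch, the inbound forwarding step boils down to delegating to the bound callback; inbound_xmit below is again a placeholder (reusing the placeholder types from the sketch above), not DPVS's actual xmit_inbound:

/* Inbound direction: forward a client packet through the connection's
 * bound callback. */
static int inbound_xmit(struct conn *c, struct pktbuf *p)
{
    if (!c || !c->packet_xmit)
        return -1;                  /* no connection / nothing bound: drop */
    return c->packet_xmit(c, p);    /* e.g. the NAT/FNAT/SNAT transmit towards the RS */
}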