In the previous article, <dpvs源码分析(续)>, we used TCP as the example to introduce connection establishment and mentioned terms such as full-nat and snat. In this article we walk through the connection-establishment path in more detail.
1. tcp_conn_sched
static int tcp_conn_sched(struct dp_vs_proto *proto,
                          const struct dp_vs_iphdr *iph,
                          struct rte_mbuf *mbuf,
                          struct dp_vs_conn **conn,
                          int *verdict)
{
    struct tcphdr *th, _tcph;
    struct dp_vs_service *svc;
    assert(proto && iph && mbuf && conn && verdict);

    th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
    if (unlikely(!th)) {
        *verdict = INET_DROP;
        return EDPVS_INVPKT;
    }

    /* Syn-proxy step 2 logic: receive client's 3-handshake ack packet */
    /* When synproxy disabled, only SYN packets can arrive here.
     * So don't judge SYNPROXY flag here! If SYNPROXY flag judged, and syn_proxy
     * got disabled and keepalived reloaded, SYN packets for RS may never be sent. */
    if (dp_vs_synproxy_ack_rcv(iph->af, mbuf, th, proto, conn, iph, verdict) == 0) {
        /* Attention: First ACK packet is also stored in conn->ack_mbuf */
        return EDPVS_PKTSTOLEN;
    }

    ..................

    svc = dp_vs_service_lookup(iph->af, iph->proto,
                               &iph->daddr, th->dest, 0, mbuf, NULL);
    if (!svc) {
        *verdict = INET_ACCEPT;
        return EDPVS_NOSERV;
    }

    ..............................

    *conn = dp_vs_schedule(svc, iph, mbuf, false);
    if (!*conn) {
        dp_vs_service_put(svc);
        *verdict = INET_DROP;
        return EDPVS_RESOURCE;
    }

    dp_vs_service_put(svc);
    return EDPVS_OK;
}
The code is again abridged; the removed parts are replaced with "...................", but the main logic is still there. In this function, dp_vs_synproxy_ack_rcv is the TCP handshake proxy: the client first completes the three-way handshake with the machine running dpvs, and only after that does the dpvs machine shake hands with the real server (RS). We won't dig into those details here. The next call is dp_vs_service_lookup. What does it do? The name pretty much gives it away: it looks up a service based on the arguments passed in, namely protocol, IP and port. A service is a collection of dests, and a dest is a real server. Once the svc has been found, we are ready to set up the connection.
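To make the service/dest relationship concrete, here is a deliberately simplified sketch. It only illustrates the idea and is not the real dpvs definitions (struct dp_vs_service and struct dp_vs_dest carry many more fields, per-lcore lists, flags and stats); all sketch_* names below are made up.

/* Simplified sketch only -- not the real dpvs structures. */
#include <stdint.h>
#include <stddef.h>

struct sketch_dest {
    uint32_t addr;              /* RS address (network order) */
    uint16_t port;              /* RS port */
    struct sketch_dest *next;   /* the dests of one service form a list */
};

struct sketch_service {
    int af;                     /* address family, e.g. AF_INET */
    uint8_t proto;              /* IPPROTO_TCP / IPPROTO_UDP */
    uint32_t vaddr;             /* virtual IP the client connects to */
    uint16_t vport;             /* virtual port */
    struct sketch_dest *dests;  /* the set of real servers behind the VIP */
};

/* Conceptual analogue of dp_vs_service_lookup: match af/proto/vip/vport. */
static struct sketch_service *
sketch_service_lookup(struct sketch_service *tbl, size_t n,
                      int af, uint8_t proto, uint32_t daddr, uint16_t dport)
{
    for (size_t i = 0; i < n; i++) {
        if (tbl[i].af == af && tbl[i].proto == proto &&
            tbl[i].vaddr == daddr && tbl[i].vport == dport)
            return &tbl[i];
    }
    return NULL;                /* caller then returns EDPVS_NOSERV */
}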
2. dp_vs_schedule
/* select an RS by service's scheduler and create a connection */
struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc,
                                  const struct dp_vs_iphdr *iph,
                                  struct rte_mbuf *mbuf,
                                  bool is_synproxy_on)
{
    uint16_t _ports[2], *ports; /* sport, dport */
    struct dp_vs_dest *dest;
    struct dp_vs_conn *conn;
    struct dp_vs_conn_param param;
    struct sockaddr_in daddr, saddr;
    int err;

    assert(svc && iph && mbuf);

    ports = mbuf_header_pointer(mbuf, iph->len, sizeof(_ports), _ports);
    if (!ports)
        return NULL;

    /* persistent service */
    if (svc->flags & DP_VS_SVC_F_PERSISTENT)
        return dp_vs_sched_persist(svc, iph, mbuf, is_synproxy_on);

    dest = svc->scheduler->schedule(svc, mbuf);
    if (!dest) {
        RTE_LOG(WARNING, IPVS, "%s: no dest found.\n", __func__);
        return NULL;
    }

    if (dest->fwdmode == DPVS_FWD_MODE_SNAT) {
        if (unlikely(iph->proto == IPPROTO_ICMP)) {
            struct icmphdr *ich, _icmph;
            ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph);
            if (!ich)
                return NULL;

            ports = _ports;
            _ports[0] = icmp4_id(ich);
            _ports[1] = ich->type << 8 | ich->code;

            /* ID may conflict for diff host,
             * need we use ID pool ? */
            dp_vs_conn_fill_param(iph->af, iph->proto,
                                  &iph->daddr, &dest->addr,
                                  ports[1], ports[0],
                                  0, &param);
        } else {
            /* we cannot inherit dest (host's src port),
             * that may conflict for diff hosts,
             * and using dest->port is worse choice. */
            memset(&daddr, 0, sizeof(daddr));
            daddr.sin_family = AF_INET;
            daddr.sin_addr = iph->daddr.in;
            daddr.sin_port = ports[1];

            memset(&saddr, 0, sizeof(saddr));
            saddr.sin_family = AF_INET;
            saddr.sin_addr = dest->addr.in;
            saddr.sin_port = 0;

            err = sa_fetch(NULL, &daddr, &saddr);
            if (err != 0)
                return NULL;

            dp_vs_conn_fill_param(iph->af, iph->proto,
                                  &iph->daddr, &dest->addr,
                                  ports[1], saddr.sin_port,
                                  0, &param);
        }
    } else {
        if (unlikely(iph->proto == IPPROTO_ICMP)) {
            struct icmphdr *ich, _icmph;
            ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph);
            if (!ich)
                return NULL;

            ports = _ports;
            _ports[0] = icmp4_id(ich);
            _ports[1] = ich->type << 8 | ich->code;
            dp_vs_conn_fill_param(iph->af, iph->proto,
                                  &iph->saddr, &iph->daddr,
                                  ports[0], ports[1], 0, &param);
        } else {
            dp_vs_conn_fill_param(iph->af, iph->proto,
                                  &iph->saddr, &iph->daddr,
                                  ports[0], ports[1], 0, &param);
        }
    }

    conn = dp_vs_conn_new(mbuf, &param, dest,
                          is_synproxy_on ? DPVS_CONN_F_SYNPROXY : 0);
    if (!conn) {
        if (dest->fwdmode == DPVS_FWD_MODE_SNAT && iph->proto != IPPROTO_ICMP)
            sa_release(NULL, &daddr, &saddr);
        return NULL;
    }

    dp_vs_stats_conn(conn);
    return conn;
}
OK, on to the code. The comment actually describes it quite clearly: select an RS (real server) via the service's scheduler, then create a connection. The function first calls dest = svc->scheduler->schedule(svc, mbuf);, which simply picks a dest (i.e. one RS) according to the service's scheduling policy. The svc's type is struct dp_vs_service, and it has a member struct dp_vs_scheduler *scheduler. Let's take a look at that struct:
struct dp_vs_scheduler {
    struct list_head    n_list;
    char                *name;
    // rte_atomic32_t      refcnt;
    struct dp_vs_dest *
        (*schedule)(struct dp_vs_service *svc,
                    const struct rte_mbuf *mbuf);
    int (*init_service)(struct dp_vs_service *svc);
    int (*exit_service)(struct dp_vs_service *svc);
    int (*update_service)(struct dp_vs_service *svc);
} __rte_cache_aligned;
In other words, different dp_vs_scheduler objects implement different scheduling policies. Doesn't this look a lot like the job and pkt_type abstractions from the previous two articles? dpvs has three such schedulers, dp_vs_rr_scheduler, dp_vs_wlc_scheduler and dp_vs_wrr_scheduler:
static struct dp_vs_scheduler dp_vs_rr_scheduler = {
    .name = "rr",           /* name */
    // .refcnt = ATOMIC_INIT(0),
    .n_list = LIST_HEAD_INIT(dp_vs_rr_scheduler.n_list),
    .init_service = dp_vs_rr_init_svc,
    .update_service = dp_vs_rr_update_svc,
    .schedule = dp_vs_rr_schedule,
};

static struct dp_vs_scheduler dp_vs_wlc_scheduler = {
    .name = "wlc",
    .n_list = LIST_HEAD_INIT(dp_vs_wlc_scheduler.n_list),
    .schedule = dp_vs_wlc_schedule,
};

static struct dp_vs_scheduler dp_vs_wrr_scheduler = {
    .name = "wrr",
    .n_list = LIST_HEAD_INIT(dp_vs_wrr_scheduler.n_list),
    .init_service = dp_vs_wrr_init_svc,
    .exit_service = dp_vs_wrr_done_svc,
    .update_service = dp_vs_wrr_update_svc,
    .schedule = dp_vs_wrr_schedule,
};
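For intuition about what a schedule() callback does, here is a minimal round-robin sketch. This is only an illustration of the idea, not the actual dp_vs_rr_schedule: the real implementation keeps its iterator in the service's scheduler data and has to handle dest availability and weights properly. The rr_* names below are made up.

/* Minimal round-robin sketch, not the real dp_vs_rr_schedule. */
#include <stddef.h>

struct rr_dest {
    int weight;                 /* 0 means this dest should be skipped */
    struct rr_dest *next;       /* dests of the service, linked in a list */
};

struct rr_service {
    struct rr_dest *dests;      /* head of the dest list */
    struct rr_dest *iter;       /* saved round-robin position */
};

/* Resume from the saved position and return the next usable dest,
 * wrapping around the list at most once. */
static struct rr_dest *rr_schedule(struct rr_service *svc)
{
    struct rr_dest *start = svc->iter ? svc->iter : svc->dests;
    struct rr_dest *d = start;

    if (!d)
        return NULL;
    do {
        if (d->weight > 0) {
            svc->iter = d->next ? d->next : svc->dests; /* remember position */
            return d;
        }
        d = d->next ? d->next : svc->dests;             /* wrap to head */
    } while (d != start);

    return NULL;                /* no usable dest */
}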
These schedulers are registered along the path dpvs_init -> dp_vs_sched_init, where dp_vs_rr_init, dp_vs_wrr_init and dp_vs_wlc_init are called in turn to register the three schedulers. We won't go into the individual scheduling algorithms here. The reason for tracing this code is to tie the different pieces of dpvs together: when you first read the dpvs code, main() looks like a pile of init calls followed by the startup of the data-plane and control-plane threads, which can be bewildering. But if you start from the data-plane thread and keep tracing forward, you will find that all of that earlier initialization gets used later on.
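The registration itself is essentially list management: each scheduler module hands its struct dp_vs_scheduler to a register helper, which links it onto a global list that is later searched by name when a service is configured. Below is a self-contained sketch of that pattern; the sketch_* names are illustrative, not the exact dpvs symbols.

/* Sketch of scheduler registration/lookup, with made-up names. */
#include <string.h>
#include <stdio.h>

struct sched_entry {
    const char *name;
    struct sched_entry *next;
};

static struct sched_entry *g_sched_list;   /* global scheduler list */

/* Illustrative register helper: prepend onto the global list. */
static int sketch_register_scheduler(struct sched_entry *s)
{
    s->next = g_sched_list;
    g_sched_list = s;
    return 0;
}

/* Lookup by name, e.g. when "rr"/"wrr"/"wlc" appears in a service config. */
static struct sched_entry *sketch_lookup_scheduler(const char *name)
{
    for (struct sched_entry *s = g_sched_list; s; s = s->next)
        if (strcmp(s->name, name) == 0)
            return s;
    return NULL;
}

static struct sched_entry rr  = { .name = "rr"  };
static struct sched_entry wrr = { .name = "wrr" };
static struct sched_entry wlc = { .name = "wlc" };

/* Mirrors the dpvs_init -> dp_vs_sched_init flow described above. */
int main(void)
{
    sketch_register_scheduler(&rr);
    sketch_register_scheduler(&wrr);
    sketch_register_scheduler(&wlc);
    printf("found: %s\n", sketch_lookup_scheduler("wlc")->name);
    return 0;
}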
Back to dp_vs_schedule: once a dest has been chosen, the various branches all end up calling dp_vs_conn_fill_param, whose main job is to fill in the last argument, param. This parameter records the source IP/port and destination IP/port, and it is consumed later by dp_vs_conn_new.
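To picture what that parameter carries, here is a rough, simplified stand-in. It is a sketch under assumptions, not the real definition: the actual dp_vs_conn_param holds more than this (for instance the real dp_vs_conn_fill_param shown above also takes an extra port argument), but the tuple idea is the point. All sketch_* names are made up.

/* Simplified stand-in for union inet_addr / struct dp_vs_conn_param. */
#include <stdint.h>
#include <string.h>
#include <netinet/in.h>

union sketch_addr {
    struct in_addr  in;
    struct in6_addr in6;
};

struct sketch_conn_param {
    int af;                          /* AF_INET / AF_INET6 */
    uint8_t proto;                   /* IPPROTO_TCP, ... */
    const union sketch_addr *caddr;  /* "client" side address */
    const union sketch_addr *vaddr;  /* "virtual"/peer side address */
    uint16_t cport;                  /* client side port (network order) */
    uint16_t vport;                  /* virtual side port (network order) */
};

/* Same spirit as dp_vs_conn_fill_param: record the tuple pieces that
 * the connection will later be keyed on. */
static void sketch_fill_param(int af, uint8_t proto,
                              const union sketch_addr *caddr,
                              const union sketch_addr *vaddr,
                              uint16_t cport, uint16_t vport,
                              struct sketch_conn_param *p)
{
    memset(p, 0, sizeof(*p));
    p->af    = af;
    p->proto = proto;
    p->caddr = caddr;
    p->vaddr = vaddr;
    p->cport = cport;
    p->vport = vport;
}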
struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf,
                                  struct dp_vs_conn_param *param,
                                  struct dp_vs_dest *dest, uint32_t flags)
dp_vs_conn_new is fairly long, so we won't paste the whole thing; a summary of its logic will do. As the prototype shows, the function ultimately builds a struct dp_vs_conn object, which stores the connection's source and destination IPs and ports. It also records the address/port tuples for both the inbound and the outbound direction. Take SNAT as an example: the client sends a packet to dpvs, dpvs forwards it to the RS, the RS replies to dpvs after processing, and dpvs then forwards the reply back to the client, so dpvs has to keep this mapping. One more thing worth noting: struct dp_vs_conn has two function-pointer members, packet_xmit and packet_out_xmit; one forwards towards the RS, the other forwards back to the client. They are actually assigned in conn_bind_dest, which is called from dp_vs_conn_new, and the assignment differs per forwarding mode.
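The per-mode binding is essentially a switch on the dest's forwarding mode. The sketch below only shows the shape of that dispatch; the xmit_* handler names and sketch_* types are hypothetical, not real dpvs symbols, and the real conn_bind_dest also binds the conn to the dest and does related bookkeeping.

/* Sketch of per-forwarding-mode xmit binding, with made-up names. */
#include <stddef.h>

struct sketch_mbuf;                 /* opaque packet, stands in for rte_mbuf */
struct sketch_conn;

typedef int (*xmit_fn)(struct sketch_conn *conn, struct sketch_mbuf *mbuf);

enum sketch_fwd_mode { FWD_FNAT, FWD_SNAT, FWD_DR, FWD_TUNNEL };

struct sketch_conn {
    enum sketch_fwd_mode fwdmode;
    xmit_fn packet_xmit;            /* inbound: towards the RS */
    xmit_fn packet_out_xmit;        /* outbound: back towards the client */
};

/* Hypothetical transmit handlers, one pair per forwarding mode. */
static int xmit_fnat_in(struct sketch_conn *c, struct sketch_mbuf *m)  { (void)c; (void)m; return 0; }
static int xmit_fnat_out(struct sketch_conn *c, struct sketch_mbuf *m) { (void)c; (void)m; return 0; }
static int xmit_snat_in(struct sketch_conn *c, struct sketch_mbuf *m)  { (void)c; (void)m; return 0; }
static int xmit_snat_out(struct sketch_conn *c, struct sketch_mbuf *m) { (void)c; (void)m; return 0; }

/* Shape of the conn_bind_dest dispatch: pick both directions' handlers
 * from the forwarding mode configured on the chosen dest. */
static int sketch_bind_xmit(struct sketch_conn *conn, enum sketch_fwd_mode mode)
{
    conn->fwdmode = mode;
    switch (mode) {
    case FWD_FNAT:
        conn->packet_xmit = xmit_fnat_in;
        conn->packet_out_xmit = xmit_fnat_out;
        break;
    case FWD_SNAT:
        conn->packet_xmit = xmit_snat_in;
        conn->packet_out_xmit = xmit_snat_out;
        break;
    default:                        /* DR/tunnel would get their own handlers */
        return -1;
    }
    return 0;
}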
That completes dpvs connection establishment. Going back to the dp_vs_in function at the end of the second article, <dpvs源码分析(续)>: once the connection is established, a packet arriving from the client goes through xmit_inbound, which in turn calls dp_vs_conn::packet_xmit to do the actual forwarding.
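As a final, tiny illustration of that call chain (hypothetical names, only the shape of the indirection): once dp_vs_in has looked up or created the connection, forwarding is one indirect call through the pointer bound above.

/* Minimal sketch of the xmit_inbound idea, with made-up names. */
struct pkt;                                             /* stands in for rte_mbuf */
struct flow {
    int (*packet_xmit)(struct flow *f, struct pkt *p);  /* set per forwarding mode */
};

static int sketch_xmit_inbound(struct flow *f, struct pkt *p)
{
    if (!f->packet_xmit)
        return -1;                  /* no handler bound for this forwarding mode */
    return f->packet_xmit(f, p);    /* e.g. rewrite headers and send towards the RS */
}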