本文为 MIT 6.S081 2020 操作系统 实验十一解析。
MIT 6.S081课程前置基础参考: 基于RISC-V搭建操作系统系列
上一节我们总体介绍了一下手册中有关数据接收和传输的章节,本节借助上节的基础来完成lab的具体代码实现。
// Scans PCI bus 0 for an e1000 network card. When one is found, enables it,
// programs its first BAR with the address chosen for the e1000 registers,
// and calls e1000_init() with that address.
void
pci_init()
{
// we'll place the e1000 registers at this address.
// vm.c maps this range.
uint64 e1000_regs = 0x40000000L;
// qemu -machine virt puts PCIe config space here.
// vm.c maps this range.
uint32 *ecam = (uint32 *) 0x30000000L;
// look at each possible PCI device on bus 0.
for(int dev = 0; dev < 32; dev++){
int bus = 0;
int func = 0;
int offset = 0;
// build the config-space offset that selects this bus/device/function.
uint32 off = (bus << 16) | (dev << 11) | (func << 8) | (offset);
// pointer to this device's configuration registers.
// NOTE(review): ecam is a uint32*, so the addition is scaled by
// sizeof(uint32) -- confirm this matches the intended ECAM layout.
volatile uint32 *base = ecam + off;
// the first configuration word identifies the device.
uint32 id = base[0];
// 100e:8086 is an e1000
if(id == 0x100e8086){
// command and status register.
// bit 0 : I/O access enable
// bit 1 : memory access enable
// bit 2 : enable mastering
base[1] = 7;
__sync_synchronize();
// probe each of the six BARs: write all 1's, then restore the previous
// value so the original base address is kept.
for(int i = 0; i < 6; i++){
uint32 old = base[4+i];
// writing all 1's to the BAR causes it to be
// replaced with its size.
base[4+i] = 0xffffffff;
__sync_synchronize();
base[4+i] = old;
}
// tell the e1000 to reveal its registers at
// physical address 0x40000000.
base[4+0] = e1000_regs;
// initialize the driver, passing the address of the e1000 registers.
e1000_init((uint32*)e1000_regs);
}
}
}
// called by pci_init().
// xregs is the memory address at which the
// e1000's registers are mapped.
//
// Resets the device, sets up the transmit and receive descriptor rings,
// programs the receive filters and the transmit/receive control registers,
// and finally enables the receive interrupt.
void
e1000_init(uint32 *xregs)
{
int i;
initlock(&e1000_lock, "e1000");
// remember where the e1000 registers are mapped.
regs = xregs;
// Reset the device:
// disable interrupts -> trigger the reset -> disable interrupts again.
regs[E1000_IMS] = 0; // disable interrupts
regs[E1000_CTL] |= E1000_CTL_RST;
regs[E1000_IMS] = 0; // redisable interrupts
__sync_synchronize();
// [E1000 14.5] Transmit initialization
// 1. clear the transmit descriptor ring.
memset(tx_ring, 0, sizeof(tx_ring));
// 2. mark every transmit descriptor as done (available to software)
//    and record that no mbuf is attached to it.
for (i = 0; i < TX_RING_SIZE; i++) {
tx_ring[i].status = E1000_TXD_STAT_DD;
tx_mbufs[i] = 0;
}
// give the ring's base address to the device.
regs[E1000_TDBAL] = (uint64) tx_ring;
// the ring size in bytes must be a multiple of 128.
if(sizeof(tx_ring) % 128 != 0)
panic("e1000");
// ring length in bytes.
regs[E1000_TDLEN] = sizeof(tx_ring);
// head and tail both start at index 0.
regs[E1000_TDH] = regs[E1000_TDT] = 0;
// [E1000 14.4] Receive initialization
// 1. clear the receive descriptor ring.
memset(rx_ring, 0, sizeof(rx_ring));
// 2. allocate one mbuf per receive descriptor and point the descriptor
//    at that mbuf's data area.
for (i = 0; i < RX_RING_SIZE; i++) {
rx_mbufs[i] = mbufalloc(0);
if (!rx_mbufs[i])
panic("e1000");
rx_ring[i].addr = (uint64) rx_mbufs[i]->head;
}
// give the ring's base address to the device.
regs[E1000_RDBAL] = (uint64) rx_ring;
// the ring size in bytes must be a multiple of 128.
if(sizeof(rx_ring) % 128 != 0)
panic("e1000");
// receive head, tail and length registers.
regs[E1000_RDH] = 0;
regs[E1000_RDT] = RX_RING_SIZE - 1;
regs[E1000_RDLEN] = sizeof(rx_ring);
// Receive filter registers: incoming frames are filtered by the card
// before they are delivered.
// filter by qemu's MAC address, 52:54:00:12:34:56
regs[E1000_RA] = 0x12005452;
regs[E1000_RA+1] = 0x5634 | (1<<31);
// multicast table
// zero every entry of the multicast table.
for (int i = 0; i < 4096/32; i++)
regs[E1000_MTA + i] = 0;
// transmitter control bits.
regs[E1000_TCTL] =
// turn the transmitter on.
E1000_TCTL_EN | // enable
// pad frames that are shorter than the minimum frame length.
E1000_TCTL_PSP | // pad short packets
// collision threshold field.
(0x10 << E1000_TCTL_CT_SHIFT) | // collision stuff
// collision distance field.
(0x40 << E1000_TCTL_COLD_SHIFT);
// gap between transmitted packets.
regs[E1000_TIPG] = 10 | (8<<10) | (6<<20); // inter-pkt gap
// receiver control bits.
regs[E1000_RCTL] =
// turn the receiver on.
E1000_RCTL_EN | // enable receiver
// accept broadcast frames.
E1000_RCTL_BAM | // enable broadcast
// size of each receive buffer.
E1000_RCTL_SZ_2048 | // 2048-byte rx buffers
// remove the Ethernet CRC from received frames.
E1000_RCTL_SECRC; // strip CRC
// ask e1000 for receive interrupts.
// both delay timers are zero, so an interrupt is raised for every packet.
regs[E1000_RDTR] = 0; // interrupt after every received packet (no timer)
regs[E1000_RADV] = 0; // interrupt after every packet (no timer)
// interrupt when a receive descriptor is written back.
regs[E1000_IMS] = (1 << 7); // RXDW -- Receiver Descriptor Write Back
}
// kernel/net.h
// Size in bytes of the backing store of every packet buffer.
#define MBUF_SIZE 2048
// A packet buffer. head points into buf at the first byte of the current
// packet data and len is the number of data bytes starting at head.
struct mbuf {
struct mbuf *next; // the next mbuf in the chain
char *head; // the current start position of the buffer
unsigned int len; // the length of the buffer
char buf[MBUF_SIZE]; // the backing store
};
// Allocates a packet buffer.
// headroom is the number of bytes left free at the start of the backing
// store so that headers can be prepended later.
// Returns 0 when headroom exceeds MBUF_SIZE or when kalloc() fails.
struct mbuf *
mbufalloc(unsigned int headroom)
{
  if (headroom > MBUF_SIZE)
    return 0;

  // the mbuf lives in memory handed out by kalloc().
  struct mbuf *pkt = kalloc();
  if (!pkt)
    return 0;

  // start with an empty, unlinked buffer whose data begins after the
  // requested headroom.
  pkt->len = 0;
  pkt->next = 0;
  pkt->head = (char *)pkt->buf + headroom;
  memset(pkt->buf, 0, sizeof(pkt->buf));
  return pkt;
}
// Frees a packet buffer by handing its memory to kfree().
void
mbuffree(struct mbuf *m)
{
  kfree((void *)m);
}
// kernel/e1000_dev.h
// [E1000 3.3.3] -- transmit descriptor; see the manual section for the
// meaning of every field.
// The driver fills addr with the address of the frame data, length with the
// frame length, cmd with E1000_TXD_CMD_* flags, and tests status for
// E1000_TXD_STAT_DD.
struct tx_desc
{
uint64 addr;
uint16 length;
uint8 cso;
uint8 cmd;
uint8 status;
uint8 css;
uint16 special;
};
// kernel/e1000_dev.h
// [E1000 3.2.3] -- receive descriptor; see the manual section for the
// meaning of every field.
struct rx_desc
{
uint64 addr; /* Address of the descriptor's data buffer */
uint16 length; /* Length of data DMAed into data buffer */
uint16 csum; /* Packet checksum */
uint8 status; /* Descriptor status */
uint8 errors; /* Descriptor Errors */
uint16 special;
};
// kernel/e1000_dev.h
// Transmit side: a ring of descriptors plus, for each slot, the mbuf whose
// data the descriptor points at (0 when the slot holds no mbuf).
#define TX_RING_SIZE 16
static struct tx_desc tx_ring[TX_RING_SIZE] __attribute__((aligned(16)));
static struct mbuf *tx_mbufs[TX_RING_SIZE];
// Receive side: a ring of descriptors plus, for each slot, the mbuf that
// the descriptor's buffer address refers to.
#define RX_RING_SIZE 16
static struct rx_desc rx_ring[RX_RING_SIZE] __attribute__((aligned(16)));
static struct mbuf *rx_mbufs[RX_RING_SIZE];
本实验的关键在于理解 E1000 网卡硬件与驱动软件的交互逻辑以及相关的数据结构,实验要求实现的两个函数:
在驱动的数据结构中, 主要为发送和接收数据的两个循环队列. 其中每个队列实际上又分为描述符队列和缓冲区指针队列, 缓冲区队列依附于描述符队列, 同时有网卡寄存器记录着队列的首尾指针, 这种设计遵循传统的驱动设计方案, 首指针由硬件管理, 尾指针由软件管理, 二者通过队列满足并发的需要.
待实现的 e1000_transmit() 函数用于将要发送的数据放入循环队列尾部, 后续由网卡硬件将数据包进行发送:
待实现的 e1000_recv() 函数则是由网卡的中断处理程序 e1000_intr() 调用:
待实现的两个函数的作用即驱动程序将以太网帧写入驱动的发送队列, 或是将以太网帧从驱动的接收队列取出并解封装.
实现 kernel/e1000.c 中的 e1000_transmit() 函数, 以完成发送以太网数据帧到网卡的工作:
其中, 最需要关注的是发送队列的首(E1000_TDH)尾(E1000_TDT)指针:
具体 e1000_transmit() 的实现, 可以按照实验指导的提示进行完成:
// Queues one Ethernet frame for transmission by the e1000.
//
// m holds a complete Ethernet frame (m->head / m->len). The frame is placed
// in the TX descriptor at the current tail index and the tail register is
// advanced so that the card picks it up.
//
// Returns 0 when the frame was queued; the driver then keeps m and frees it
// the next time the same ring slot is reused. Returns -1 when the slot at
// the tail has not been completed by the card yet (ring full); in that case
// m is left untouched and still belongs to the caller.
int
e1000_transmit(struct mbuf *m)
{
  uint32 tail;
  struct tx_desc *desc;

  acquire(&e1000_lock);

  tail = regs[E1000_TDT];
  desc = &tx_ring[tail];

  // the slot may only be reused once the card has marked it done.
  if ((desc->status & E1000_TXD_STAT_DD) == 0) {
    release(&e1000_lock);
    return -1;
  }

  // free the mbuf that was sent from this slot last time around.
  if (tx_mbufs[tail]) {
    mbuffree(tx_mbufs[tail]);
    tx_mbufs[tail] = 0;
  }

  // fill in the descriptor. This frame is a whole packet (EOP) and the
  // card is asked to report completion in the status field (RS).
  desc->addr = (uint64) m->head;
  desc->length = m->len;
  desc->cmd = E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS;
  // clear the old DD bit so that the "ring full" test above only succeeds
  // after the card has really finished with this descriptor.
  desc->status = 0;
  tx_mbufs[tail] = m;

  // make sure the descriptor is written before the tail register moves.
  __sync_synchronize();
  regs[E1000_TDT] = (tail + 1) % TX_RING_SIZE;

  release(&e1000_lock);
  return 0;
}
实现 kernel/e1000.c 中的 e1000_recv() 函数, 以完成从网卡接收数据到内核的工作.
网卡接收数据同样有接收队列 rx_ring 及其缓冲区 rx_mbufs. 逻辑与发送队列基本相同, 具体可见开发手册 3.2.6 节.
具体 e1000_recv() 的实现, 同样可以按照实验指导的提示进行完成:
static void
e1000_recv(void)
{
//
// Your code here.
//
// Check for packets that have arrived from the e1000
// Create and deliver an mbuf for each packet (using net_rx()).
//
int tail = (regs[E1000_RDT] + 1) % RX_RING_SIZE;
struct rx_desc *desc = &rx_ring[tail];
while ((desc->status & E1000_RXD_STAT_DD)) {
if(desc->length > MBUF_SIZE) {
panic("e1000 len");
}
// update the length reported in the descriptor.
rx_mbufs[tail]->len = desc->length;
// deliver the mbuf to the network stack
net_rx(rx_mbufs[tail]);
// allocate a new mbuf replace the one given to net_rx()
rx_mbufs[tail] = mbufalloc(0);
if (!rx_mbufs[tail]) {
panic("e1000 no mubfs");
}
desc->addr = (uint64) rx_mbufs[tail]->head;
desc->status = 0;
tail = (tail + 1) % RX_RING_SIZE;
desc = &rx_ring[tail];
}
regs[E1000_RDT] = (tail - 1) % RX_RING_SIZE;
}
本节,我们来完整过一遍数据包发送的流程,从nettest测试程序的ping函数开始:
// user/nettest.c
// send a UDP packet to the localhost (outside of qemu),
// and receive a response.
static void
ping(uint16 sport, uint16 dport, int attempts)
{
int fd;
char *obuf = "a message from xv6!";
uint32 dst;
// 10.0.2.2, which qemu remaps to the external host,
// i.e. the machine you're running qemu on.
// destination IP address.
dst = (10 << 24) | (0 << 16) | (2 << 8) | (2 << 0);
// you can send a UDP packet to any Internet address
// by using a different dst.
// open a UDP socket for (destination IP, source port, destination port);
// connect() returns the file descriptor of the socket.
if((fd = connect(dst, sport, dport)) < 0){
fprintf(2, "ping: connect() failed\n");
exit(1);
}
for(int i = 0; i < attempts; i++) {
// write the message to the UDP socket.
if(write(fd, obuf, strlen(obuf)) < 0){
fprintf(2, "ping: send() failed\n");
exit(1);
}
}
...
}
// kernel/sysfile.c
// System call: create a socket for (remote address, local port, remote
// port) and return a file descriptor for it, or -1 on failure.
int
sys_connect(void)
{
  struct file *sockfile;
  uint32 raddr;
  uint32 rport;
  uint32 lport;
  int fd;

  // arguments: remote IP address, local port, remote port.
  if (argint(0, (int*)&raddr) < 0)
    return -1;
  if (argint(1, (int*)&lport) < 0)
    return -1;
  if (argint(2, (int*)&rport) < 0)
    return -1;

  // allocate the socket together with its file object.
  if (sockalloc(&sockfile, raddr, lport, rport) < 0)
    return -1;

  // bind the file to a free descriptor of the current process.
  fd = fdalloc(sockfile);
  if (fd < 0) {
    fileclose(sockfile);
    return -1;
  }
  return fd;
}
sockalloc用于分配一个新的socket对象:
// kernel/sysnet.c
// One UDP socket, identified by (raddr, lport, rport). Sockets are kept in
// a singly linked list; received packets wait in rxq until they are read.
struct sock {
struct sock *next; // the next socket in the list
uint32 raddr; // the remote IPv4 address
uint16 lport; // the local UDP port number
uint16 rport; // the remote UDP port number
struct spinlock lock; // protects the rxq
struct mbufq rxq; // a queue of packets waiting to be received
};
// lock is held while the list of sockets is walked or modified.
static struct spinlock lock;
// head of the list of all open sockets.
static struct sock *sockets;
// Creates a socket for the connection (raddr, lport, rport) together with
// the file object that refers to it.
//
// f      receives the newly allocated file on success.
// raddr  remote IPv4 address.
// lport  local UDP port.
// rport  remote UDP port.
//
// Returns 0 on success. Returns -1 when no file or no memory is available,
// or when a socket with the same (raddr, lport, rport) already exists.
int
sockalloc(struct file **f, uint32 raddr, uint16 lport, uint16 rport)
{
  struct sock *si, *pos;

  si = 0;
  *f = 0;
  // take a free entry from the global file table.
  if ((*f = filealloc()) == 0)
    goto bad;
  // memory for the socket itself.
  if ((si = (struct sock*)kalloc()) == 0)
    goto bad;

  // initialize objects
  si->raddr = raddr;
  si->lport = lport;
  si->rport = rport;
  initlock(&si->lock, "sock");
  mbufq_init(&si->rxq);

  // add to list of sockets, refusing a duplicate (raddr, lport, rport).
  acquire(&lock);
  for (pos = sockets; pos; pos = pos->next) {
    if (pos->raddr == raddr &&
        pos->lport == lport &&
        pos->rport == rport) {
      release(&lock);
      goto bad;
    }
  }
  si->next = sockets;
  sockets = si;
  release(&lock);

  // Only now attach the socket to the file. Doing this before the
  // duplicate check would leave the file pointing at si on the bad path,
  // where si is freed right before fileclose() runs on that file.
  // NOTE(review): fileclose() is not visible here; it is expected to tear
  // down f->sock for FD_SOCK files -- confirm.
  (*f)->type = FD_SOCK;
  (*f)->readable = 1;
  (*f)->writable = 1;
  (*f)->sock = si;
  return 0;

bad:
  if (si)
    kfree((char*)si);
  if (*f)
    fileclose(*f);
  return -1;
}
sockwrite用于向socket写入数据:
// Number of bytes kept free at the front of an mbuf for protocol headers.
#define MBUF_DEFAULT_HEADROOM 128

// Sends n bytes from user virtual address addr as one UDP packet on socket
// si.
//
// Returns n on success. Returns -1 when n is negative or does not fit into
// one mbuf behind the header room, when no mbuf can be allocated, or when
// the user memory cannot be copied.
int
sockwrite(struct sock *si, uint64 addr, int n)
{
  struct proc *pr = myproc();
  struct mbuf *m;

  // n comes from user space. Reject sizes that cannot fit in the buffer
  // after the header room instead of letting mbufput() panic or the copy
  // run past the backing store.
  if (n < 0 || n > MBUF_SIZE - MBUF_DEFAULT_HEADROOM)
    return -1;

  // packet buffer with room in front for the UDP/IP/Ethernet headers.
  m = mbufalloc(MBUF_DEFAULT_HEADROOM);
  if (!m)
    return -1;

  // reserve n bytes in the mbuf and fill them from user memory.
  if (copyin(pr->pagetable, mbufput(m, n), addr, n) == -1) {
    mbuffree(m);
    return -1;
  }

  // hand the payload to the UDP layer.
  net_tx_udp(m, si->raddr, si->lport, si->rport);
  return n;
}
// Allocates a packet buffer.
// headroom is the number of bytes left free at the start of the backing
// store so that headers can be prepended later.
// Returns 0 when headroom exceeds MBUF_SIZE or when kalloc() fails.
struct mbuf *
mbufalloc(unsigned int headroom)
{
  if (headroom > MBUF_SIZE)
    return 0;

  // the mbuf lives in memory handed out by kalloc().
  struct mbuf *pkt = kalloc();
  if (!pkt)
    return 0;

  // start with an empty, unlinked buffer whose data begins after the
  // requested headroom.
  pkt->len = 0;
  pkt->next = 0;
  pkt->head = (char *)pkt->buf + headroom;
  memset(pkt->buf, 0, sizeof(pkt->buf));
  return pkt;
}
// Appends data to the end of the buffer and returns a pointer to it.
// len is the number of bytes to reserve after the current data; the caller
// writes the bytes through the returned pointer.
// Panics when the reserved bytes would not fit into the backing store.
char *
mbufput(struct mbuf *m, unsigned int len)
{
  // new data goes right after the bytes that are already in the buffer.
  char *tmp = m->head + m->len;
  // bytes of the backing store in front of tmp (headroom plus data).
  unsigned int used = (unsigned int)(tmp - m->buf);

  // the data starts at head, not at buf, so the space still free is
  // MBUF_SIZE - used; comparing only len against MBUF_SIZE would let the
  // data run past the end of buf.
  if (used > MBUF_SIZE || len > MBUF_SIZE - used)
    panic("mbufput");

  m->len += len;
  return tmp;
}
// Prepends room for a header of the type of hdr via mbufpush() and yields a
// pointer of that type to it.
#define mbufpushhdr(mbuf, hdr) (typeof(hdr)*)mbufpush(mbuf, sizeof(hdr))
// a UDP packet header (comes after an IP header).
// The network code stores these fields in network byte order
// (htons()/ntohs()).
struct udp {
uint16 sport; // source port
uint16 dport; // destination port
uint16 ulen; // length, including udp header, not including IP header
uint16 sum; // checksum
};
// sends a UDP packet
// m holds the payload; a UDP header is put in front of it and the packet is
// passed on to the IP layer for destination address dip.
void
net_tx_udp(struct mbuf *m, uint32 dip,
           uint16 sport, uint16 dport)
{
  // make room for the UDP header in front of the payload.
  struct udp *hdr = mbufpushhdr(m, *hdr);

  hdr->sum = 0; // zero means no checksum is provided
  // m->len now covers the UDP header plus the payload.
  hdr->ulen = htons(m->len);
  hdr->dport = htons(dport);
  hdr->sport = htons(sport);

  // now on to the IP layer
  net_tx_ip(m, IPPROTO_UDP, dip);
}
// Prepends data to the beginning of the buffer and returns a pointer to it.
// Panics when the buffer has less than len bytes of room in front of the
// current data.
char *
mbufpush(struct mbuf *m, unsigned int len)
{
  char *start = m->head - len;

  m->head = start;
  if (start < m->buf)
    panic("mbufpush");
  m->len += len;
  return start;
}
// an IP packet header (comes after an Ethernet header).
// Multi-byte fields are stored in network byte order by the network code.
struct ip {
uint8 ip_vhl; // version << 4 | header length >> 2
uint8 ip_tos; // type of service
uint16 ip_len; // total length
uint16 ip_id; // identification
uint16 ip_off; // fragment offset field
uint8 ip_ttl; // time to live
uint8 ip_p; // protocol
uint16 ip_sum; // checksum
uint32 ip_src, ip_dst;
};
// sends an IP packet
// Puts an IP header for protocol proto and destination dip in front of the
// data in m and passes the packet on to the Ethernet layer.
static void
net_tx_ip(struct mbuf *m, uint8 proto, uint32 dip)
{
  // push the IP header and start from an all-zero header.
  struct ip *hdr = mbufpushhdr(m, *hdr);
  memset(hdr, 0, sizeof(*hdr));

  // version 4, header length 20 bytes.
  hdr->ip_vhl = (4 << 4) | (20 >> 2);
  hdr->ip_ttl = 100;
  // protocol carried above IP.
  hdr->ip_p = proto;
  // length of the mbuf data at this point, IP header included.
  hdr->ip_len = htons(m->len);
  hdr->ip_dst = htonl(dip);
  hdr->ip_src = htonl(local_ip);

  // the checksum covers the finished header, so it is computed last.
  hdr->ip_sum = in_cksum((unsigned char *)hdr, sizeof(*hdr));

  // now on to the ethernet layer
  net_tx_eth(m, ETHTYPE_IP);
}
// MAC address used as the source address of every frame that is sent.
static uint8 local_mac[ETHADDR_LEN] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };
// All-ones MAC address used as the destination of every frame that is sent.
static uint8 broadcast_mac[ETHADDR_LEN] = { 0xFF, 0XFF, 0XFF, 0XFF, 0XFF, 0XFF };
// an Ethernet packet header (start of the packet).
struct eth {
uint8 dhost[ETHADDR_LEN]; // destination MAC address
uint8 shost[ETHADDR_LEN]; // source MAC address
uint16 type; // type of the payload (e.g. ETHTYPE_IP), network byte order
} __attribute__((packed));
// sends an ethernet packet
// Puts an Ethernet header of type ethtype in front of the data in m and
// hands the frame to the e1000 driver.
static void
net_tx_eth(struct mbuf *m, uint16 ethtype)
{
struct eth *ethhdr;
// make room for the Ethernet header in front of the data.
ethhdr = mbufpushhdr(m, *ethhdr);
// the source address is our own MAC address.
memmove(ethhdr->shost, local_mac, ETHADDR_LEN);
// In a real networking stack, dhost would be set to the address discovered
// through ARP. Because we don't support enough of the ARP protocol, set it
// to broadcast instead.
memmove(ethhdr->dhost, broadcast_mac, ETHADDR_LEN);
// type of the protocol carried in the frame.
ethhdr->type = htons(ethtype);
// pass the frame to the e1000 driver. A nonzero return value means the
// driver did not accept the mbuf, so it has to be freed here.
if (e1000_transmit(m)) {
mbuffree(m);
}
}
还是继续从nettest.c文件中的ping函数开始继续往下阅读:
//
// send a UDP packet to the localhost (outside of qemu),
// and receive a response.
//
static void
ping(uint16 sport, uint16 dport, int attempts)
{
int fd;
char *obuf = "a message from xv6!";
uint32 dst;
// 10.0.2.2, which qemu remaps to the external host,
// i.e. the machine you're running qemu on.
dst = (10 << 24) | (0 << 16) | (2 << 8) | (2 << 0);
// you can send a UDP packet to any Internet address
// by using a different dst.
if((fd = connect(dst, sport, dport)) < 0){
fprintf(2, "ping: connect() failed\n");
exit(1);
}
...
char ibuf[128];
// read the reply from the socket into ibuf, leaving room for a
// terminating NUL byte.
int cc = read(fd, ibuf, sizeof(ibuf)-1);
if(cc < 0){
fprintf(2, "ping: recv() failed\n");
exit(1);
}
// close the socket.
close(fd);
ibuf[cc] = '\0';
if(strcmp(ibuf, "this is the host!") != 0){
fprintf(2, "ping didn't receive correct payload\n");
exit(1);
}
}
// Reads one received packet from socket si into user memory at addr.
// Sleeps until a packet is queued. At most n bytes are copied; the packet
// is freed afterwards either way.
// Returns the number of bytes copied, or -1 when the process has been
// killed or the copy to user memory fails.
int
sockread(struct sock *si, uint64 addr, int n)
{
  struct proc *pr = myproc();
  struct mbuf *pkt;
  int avail, nbytes, result;

  acquire(&si->lock);
  // wait for a packet unless the process has been killed.
  while (mbufq_empty(&si->rxq) && !pr->killed)
    sleep(&si->rxq, &si->lock);
  if (pr->killed) {
    release(&si->lock);
    return -1;
  }
  // take the oldest packet off the queue.
  pkt = mbufq_pophead(&si->rxq);
  release(&si->lock);

  // copy no more than the caller asked for.
  avail = pkt->len;
  nbytes = avail > n ? n : avail;

  result = nbytes;
  if (copyout(pr->pagetable, addr, pkt->head, nbytes) == -1)
    result = -1;

  // the packet has been consumed.
  mbuffree(pkt);
  return result;
}
// Interrupt handler of the e1000: acknowledges the interrupt and then
// processes the packets that have been received.
void
e1000_intr(void)
{
// tell the e1000 we've seen this interrupt;
// without this the e1000 won't raise any
// further interrupts.
regs[E1000_ICR] = 0xffffffff;
// collect the received packets and pass them up the stack.
e1000_recv();
}
// Strips a header of the type of hdr from the front of mbuf via mbufpull()
// and yields a pointer of that type to it (0 when too little data is left).
#define mbufpullhdr(mbuf, hdr) (typeof(hdr)*)mbufpull(mbuf, sizeof(hdr))
// called by e1000 driver's interrupt handler to deliver a packet to the
// networking stack
// Strips the Ethernet header and dispatches on the type field; the mbuf is
// freed here when the frame is too short or of an unknown type.
void net_rx(struct mbuf *m)
{
  struct eth *hdr = mbufpullhdr(m, *hdr);

  // frame shorter than an Ethernet header.
  if (hdr == 0) {
    mbuffree(m);
    return;
  }

  switch (ntohs(hdr->type)) {
  case ETHTYPE_IP:
    net_rx_ip(m);
    break;
  case ETHTYPE_ARP:
    net_rx_arp(m);
    break;
  default:
    // protocol that is not handled.
    mbuffree(m);
    break;
  }
}
// Strips data from the start of the buffer and returns a pointer to it.
// Returns 0 if less than the full requested length is available.
char *
mbufpull(struct mbuf *m, unsigned int len)
{
  if (len > m->len)
    return 0;

  char *front = m->head;
  m->head = front + len;
  m->len -= len;
  return front;
}
3. 剥离了以太网头部的mbuf继续往上层传递
// receives an IP packet
// Strips and validates the IP header, then passes the rest to the UDP
// layer. The mbuf is freed here when any check fails.
static void
net_rx_ip(struct mbuf *m)
{
  struct ip *hdr = mbufpullhdr(m, *hdr);
  uint16 len;

  // The checks run in this order and stop at the first failure:
  //   - a complete IP header is present
  //   - IP version 4 with a 20-byte header
  //   - the header checksum is valid
  //   - the packet is not fragmented
  //   - the packet is addressed to us
  //   - the payload is UDP
  int ok = hdr != 0
        && hdr->ip_vhl == ((4 << 4) | (20 >> 2))
        && !in_cksum((unsigned char *)hdr, sizeof(*hdr))
        && htons(hdr->ip_off) == 0
        && htonl(hdr->ip_dst) == local_ip
        && hdr->ip_p == IPPROTO_UDP;

  if (!ok) {
    mbuffree(m);
    return;
  }

  // length of what follows the IP header according to the header itself.
  len = ntohs(hdr->ip_len) - sizeof(*hdr);
  net_rx_udp(m, len, hdr);
}
4. 剥离了ip头部的mbuf继续往上层传递
// Strips data from the end of the buffer and returns a pointer to it.
// Returns 0 if less than the full requested length is available.
char *
mbuftrim(struct mbuf *m, unsigned int len)
{
  if (m->len < len)
    return 0;

  unsigned int kept = m->len - len;
  m->len = kept;
  // the stripped bytes start right after the data that is kept.
  return m->head + kept;
}
// receives a UDP packet
// len is the UDP length (header plus payload) derived from the IP header.
// Strips the UDP header, drops bytes beyond the payload, and delivers the
// payload to the matching socket. The mbuf is freed here when a check
// fails.
static void
net_rx_udp(struct mbuf *m, uint16 len, struct ip *iphdr)
{
  struct udp *hdr = mbufpullhdr(m, *hdr);

  // TODO: validate UDP checksum

  // the header must be complete and agree with the length from the IP
  // header.
  if (!hdr || ntohs(hdr->ulen) != len) {
    mbuffree(m);
    return;
  }

  // from here on len is the payload length.
  len -= sizeof(*hdr);
  if (len > m->len) {
    mbuffree(m);
    return;
  }

  // minimum packet size could be larger than the payload:
  // cut off whatever follows the payload.
  mbuftrim(m, m->len - len);

  // deliver with the sender's address and the port numbers, all in host
  // byte order; our local port is the packet's destination port.
  sockrecvudp(m, ntohl(iphdr->ip_src), ntohs(hdr->dport), ntohs(hdr->sport));
}
// called by protocol handler layer to deliver UDP packets
//
// Finds the socket registered for (raddr, lport, rport), appends the mbuf
// to its receive queue and wakes any sleeping reader. The mbuf is freed
// when no such socket exists.
void
sockrecvudp(struct mbuf *m, uint32 raddr, uint16 lport, uint16 rport)
{
  struct sock *si;

  acquire(&lock);
  // look for the socket with the same address and ports.
  for (si = sockets; si; si = si->next) {
    if (si->raddr == raddr && si->lport == lport && si->rport == rport)
      break;
  }

  if (!si) {
    // nobody is interested in this packet.
    release(&lock);
    mbuffree(m);
    return;
  }

  acquire(&si->lock);
  // queue the packet and wake readers sleeping on the queue.
  mbufq_pushtail(&si->rxq, m);
  wakeup(&si->rxq);
  release(&si->lock);
  release(&lock);
}
// Pushes an mbuf to the end of the queue.
void
mbufq_pushtail(struct mbufq *q, struct mbuf *m)
{
  // m becomes the last element.
  m->next = 0;

  if (q->head) {
    // non-empty queue: link m behind the current tail.
    q->tail->next = m;
    q->tail = m;
  } else {
    // empty queue: m is both head and tail.
    q->tail = m;
    q->head = m;
  }
}
当数据传输和接收完毕后,需要关闭socket连接:
// Closes socket si: unlinks it from the list of sockets, frees every packet
// still waiting in its receive queue, and frees the socket itself.
void
sockclose(struct sock *si)
{
  struct sock **link;

  // remove from list of sockets
  acquire(&lock);
  for (link = &sockets; *link; link = &(*link)->next) {
    if (*link == si) {
      *link = si->next;
      break;
    }
  }
  release(&lock);

  // free any pending mbufs
  while (!mbufq_empty(&si->rxq))
    mbuffree(mbufq_pophead(&si->rxq));

  // release the memory of the socket object.
  kfree((char*)si);
}
// receives an ARP packet
// Answers ARP requests for our own IP address with an ARP reply; everything
// else is ignored. The mbuf is always freed before returning.
static void
net_rx_arp(struct mbuf *m)
{
  uint8 sender_mac[ETHADDR_LEN];
  struct arp *req = mbufpullhdr(m, *req);

  // Handle the packet only when
  //   - a complete ARP header is present,
  //   - it describes Ethernet hardware addresses and IP protocol addresses
  //     of the expected lengths,
  //   - it is a request (only requests are supported so far), and
  //   - the solicited address is our IP address.
  if (req
      && ntohs(req->hrd) == ARP_HRD_ETHER
      && ntohs(req->pro) == ETHTYPE_IP
      && req->hln == ETHADDR_LEN
      && req->pln == sizeof(uint32)
      && ntohs(req->op) == ARP_OP_REQUEST
      && ntohl(req->tip) == local_ip) {
    // reply to the sender's ethernet address and
    // IP address (qemu's slirp).
    memmove(sender_mac, req->sha, ETHADDR_LEN);
    net_tx_arp(ARP_OP_REPLY, sender_mac, ntohl(req->sip));
  }

  mbuffree(m);
}
// an ARP packet (comes after an Ethernet header).
struct arp {
uint16 hrd; // format of hardware address
uint16 pro; // format of protocol address
uint8 hln; // length of hardware address
uint8 pln; // length of protocol address
uint16 op; // operation
char sha[ETHADDR_LEN]; // sender hardware address
uint32 sip; // sender IP address
char tha[ETHADDR_LEN]; // target hardware address
uint32 tip; // target IP address
} __attribute__((packed));
// Sends an ARP packet with operation op to hardware address dmac and IP
// address dip, using our own MAC and IP address as the sender.
// Returns 0 after the packet has been handed to the Ethernet layer, or -1
// when no mbuf could be allocated.
static int
net_tx_arp(uint16 op, uint8 dmac[ETHADDR_LEN], uint32 dip)
{
  struct mbuf *pkt = mbufalloc(MBUF_DEFAULT_HEADROOM);
  if (pkt == 0)
    return -1;

  // reserve the ARP header inside the new mbuf.
  struct arp *hdr = mbufputhdr(pkt, *hdr);

  // generic part of the ARP header
  hdr->op = htons(op);
  hdr->hrd = htons(ARP_HRD_ETHER);
  hdr->pro = htons(ETHTYPE_IP);
  hdr->hln = ETHADDR_LEN;
  hdr->pln = sizeof(uint32);

  // ethernet + IP part of the ARP header: sender first, then target.
  memmove(hdr->sha, local_mac, ETHADDR_LEN);
  hdr->sip = htonl(local_ip);
  memmove(hdr->tha, dmac, ETHADDR_LEN);
  hdr->tip = htonl(dip);

  // header is ready, send the packet
  net_tx_eth(pkt, ETHTYPE_ARP);
  return 0;
}