mmap涉及很多操作系统底层的知识,这里先粗略介绍一下大概的逻辑,等深入理解后再继续补充。操作系统用vma链表管理进程的虚拟内存,mmap就是申请一个新的vma供进程使用。vma对应的这片地址空间既可以当作普通内存使用,也可以当作文件来访问。但是刚申请的vma还不会分配物理页,等到真正访问这片地址的时候,才由缺页处理程序完成物理页的映射。
下面是mmap函数入口,没有太多逻辑。
/*
 * sys_mmap - system-call entry point for mmap.
 *
 * buffer points at six longs in user space. They are forwarded to
 * do_mmap() as, in order: addr, len, prot, flags, fd and off. For a
 * non-anonymous mapping the fd slot is resolved to the task's open file
 * before the call.
 *
 * Returns the result of do_mmap(), the verify_area() error if the user
 * buffer is not readable, or -EBADF if fd does not name an open file.
 */
asmlinkage int sys_mmap(unsigned long *buffer)
{
	struct file *filp = NULL;
	unsigned long map_flags;
	int err = verify_area(VERIFY_READ, buffer, 6*sizeof(long));

	if (err)
		return err;

	map_flags = get_fs_long(buffer+3);
	/* Only file-backed requests carry a descriptor that must be valid. */
	if (!(map_flags & MAP_ANONYMOUS)) {
		unsigned long fd = get_fs_long(buffer+4);

		if (fd >= NR_OPEN)
			return -EBADF;
		filp = current->files->fd[fd];
		if (!filp)
			return -EBADF;
	}

	return do_mmap(filp, get_fs_long(buffer), get_fs_long(buffer+1),
		       get_fs_long(buffer+2), map_flags, get_fs_long(buffer+5));
}
接下来看do_mmap函数
/*
 * do_mmap - validate a mapping request, build a vm_area_struct for it and
 * attach it to the current task.
 *
 * file:  file to map, or NULL for an anonymous mapping
 * addr:  requested start address; honoured only with MAP_FIXED, otherwise
 *        replaced by the result of get_unmapped_area(len)
 * len:   length in bytes, rounded up with PAGE_ALIGN
 * prot:  PROT_* bits
 * flags: MAP_* bits
 * off:   offset, stored in vma->vm_offset
 *
 * Returns the mapped start address on success. On failure a negative errno
 * (-EINVAL, -EACCES, -ETXTBSY, -ENOMEM, -ENODEV, or the mapper's own error)
 * is returned through the unsigned long return type.
 */
unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long off)
{
int error;
struct vm_area_struct * vma;
// Round len up to whole pages; a zero-length request simply returns addr
if ((len = PAGE_ALIGN(len)) == 0)
return addr;
// Reject a range whose start or end would lie beyond TASK_SIZE
if (addr > TASK_SIZE || len > TASK_SIZE || addr > TASK_SIZE-len)
return -EINVAL;
/* offset overflow? */
if (off + len < off)
return -EINVAL;
/*
* do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
// File-backed mapping
if (file != NULL) {
// Check the requested mapping type against the file's f_mode bits
switch (flags & MAP_TYPE) {
// Shared mapping
case MAP_SHARED:
// PROT_WRITE on a shared mapping requires f_mode bit 2 (file open for
// writing); a read-only shared mapping passes this test
if ((prot & PROT_WRITE) && !(file->f_mode & 2))
return -EACCES;
/* fall through */
// Private mapping
case MAP_PRIVATE:
// Both mapping types require f_mode bit 1 (presumably: file open for reading)
if (!(file->f_mode & 1))
return -EACCES;
break;
default:
return -EINVAL;
}
// MAP_DENYWRITE is refused while the inode has any writer (i_wcount > 0)
if ((flags & MAP_DENYWRITE) && (file->f_inode->i_wcount > 0))
return -ETXTBSY;
} else if ((flags & MAP_TYPE) != MAP_PRIVATE) // an anonymous mapping must be MAP_PRIVATE
return -EINVAL;
/*
* obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
// MAP_FIXED: the mapping must be placed exactly at addr
if (flags & MAP_FIXED) {
// addr must be page aligned
if (addr & ~PAGE_MASK)
return -EINVAL;
// The whole range must fit below TASK_SIZE (repeats the earlier range check)
if (len > TASK_SIZE || addr > TASK_SIZE - len)
return -EINVAL;
} else {
// Otherwise pick an unused address range of the required length
addr = get_unmapped_area(len);
if (!addr)
return -ENOMEM;
}
/*
* determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
if (file && (!file->f_op || !file->f_op->mmap))
return -ENODEV;
// Allocate the vma that will describe this mapping
vma = (struct vm_area_struct *)kmalloc(sizeof(struct vm_area_struct),
GFP_KERNEL);
if (!vma)
return -ENOMEM;
vma->vm_task = current;
// Record the address range covered by the vma
vma->vm_start = addr;
vma->vm_end = addr + len;
// Copy the read/write/exec bits from prot
vma->vm_flags = prot & (VM_READ | VM_WRITE | VM_EXEC);
// Copy the grows-down / deny-write / executable bits from flags
vma->vm_flags |= flags & (VM_GROWSDOWN | VM_DENYWRITE | VM_EXECUTABLE);
// File-backed mapping
if (file) {
// f_mode bit 1 set (presumably: file open for reading)
if (file->f_mode & 1)
// Grant all three VM_MAY* bits
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
// Shared mapping requested
if (flags & MAP_SHARED) {
// Mark the vma shared and set VM_MAYSHARE
vma->vm_flags |= VM_SHARED | VM_MAYSHARE;
/*
* This looks strange, but when we don't have the file open
* for writing, we can demote the shared mapping to a simpler
* private mapping. That also takes care of a security hole
* with ptrace() writing to a shared mapping without write
* permissions.
*
* We leave the VM_MAYSHARE bit on, just to get correct output
* from /proc/xxx/maps..
*/
// File not open for writing: clear VM_MAYWRITE and VM_SHARED
if (!(file->f_mode & 2))
vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
}
} else
// Anonymous mapping: grant all three VM_MAY* bits
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
// Translate the low four vm_flags bits into a page protection via protection_map
vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
vma->vm_ops = NULL;
vma->vm_offset = off;
// Start without an inode; the mapper called below receives the vma and may set it
vma->vm_inode = NULL;
vma->vm_pte = 0;
// Remove whatever was previously mapped in [addr, addr+len)
do_munmap(addr, len); /* Clear old maps */
// File-backed: call the mmap operation supplied by the file's f_op
if (file)
error = file->f_op->mmap(file->f_inode, file, vma);
else
// Anonymous mapping
error = anon_map(NULL, NULL, vma);
// The mapper failed: free the vma and pass its error up
if (error) {
kfree(vma);
return error;
}
// Attach the new vma to the current task, then run merge_segments over its range
insert_vm_struct(current, vma);
merge_segments(current, vma->vm_start, vma->vm_end);
return addr;
}
从上面的代码中可以大概知道,do_mmap主要的逻辑是申请并初始化一个vma:如果是文件映射则调用文件系统的mmap函数,否则建立一个匿名映射,最后把vma插入到进程的vma链表和avl树。下面我们接着看文件映射和匿名映射具体做了什么。首先看匿名映射,函数是anon_map。
/*
 * anon_map - set up an anonymous mapping for vma.
 *
 * Runs zeromap_page_range() over [vm_start, vm_end) with the vma's page
 * protection. ino and file are not used.
 *
 * Returns 0 on success, -ENOMEM if zeromap_page_range() reports any error.
 */
static int anon_map(struct inode *ino, struct file * file, struct vm_area_struct * vma)
{
	unsigned long length = vma->vm_end - vma->vm_start;

	return zeromap_page_range(vma->vm_start, length, vma->vm_page_prot) ? -ENOMEM : 0;
}
/*
 * zeromap_page_range - point every page-table entry covering
 * [address, address + size) of the current task at ZERO_PAGE.
 *
 * The entry that gets installed is built from ZERO_PAGE and prot and is
 * write-protected. The range is walked one top-level directory slot at a
 * time, and the per-slot work is delegated to zeromap_pmd_range().
 *
 * Returns 0 on success, -ENOMEM if pmd_alloc() fails, or the error from
 * zeromap_pmd_range(). invalidate() is called on every exit path.
 */
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
	const unsigned long limit = address + size;
	pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
	pgd_t *pgd = pgd_offset(current, address);
	int rc = 0;

	/* Each pass handles one directory slot, then jumps to the next boundary. */
	for (; address < limit; address = (address + PGDIR_SIZE) & PGDIR_MASK, pgd++) {
		pmd_t *pmd = pmd_alloc(pgd, address);

		if (!pmd) {
			rc = -ENOMEM;
			break;
		}
		rc = zeromap_pmd_range(pmd, address, limit - address, zero_pte);
		if (rc)
			break;
	}

	/* Flush stale translations whether or not the walk completed. */
	invalidate();
	return rc;
}
/*
 * zeromap_pte_range - overwrite consecutive page-table entries with
 * zero_pte, starting at pte.
 *
 * Only the part of [address, address + size) that falls inside one
 * PMD_SIZE span is processed. Each entry's previous content is handed to
 * forget_pte() after the new value has been stored. At least one entry is
 * always written.
 */
static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
{
	/* Work with the offset inside the PMD_SIZE span. */
	unsigned long offset = address & ~PMD_MASK;
	unsigned long limit = offset + size;

	if (limit > PMD_SIZE)
		limit = PMD_SIZE;

	do {
		pte_t previous = *pte;

		*pte = zero_pte;
		forget_pte(previous);
		pte++;
		offset += PAGE_SIZE;
	} while (offset < limit);
}
/*
 * zeromap_pmd_range - apply zeromap_pte_range() to every middle-directory
 * slot covering [address, address + size), starting at pmd.
 *
 * Only the part of the range that falls inside one PGDIR_SIZE span is
 * processed. At least one slot is always visited.
 *
 * Returns 0 on success, -ENOMEM if pte_alloc() fails.
 */
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
{
	/* Work with the offset inside the PGDIR_SIZE span. */
	unsigned long offset = address & ~PGDIR_MASK;
	unsigned long limit = offset + size;

	if (limit > PGDIR_SIZE)
		limit = PGDIR_SIZE;

	do {
		pte_t *table = pte_alloc(pmd, offset);

		if (!table)
			return -ENOMEM;
		zeromap_pte_range(table, offset, limit - offset, zero_pte);
		/* Advance to the start of the next PMD_SIZE span. */
		offset = (offset + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (offset < limit);

	return 0;
}
/*
 * forget_pte - release whatever a page-table entry value referred to.
 *
 * - empty entry: nothing to do
 * - entry not present: its value is passed to swap_free()
 * - present entry: the page is passed to free_page(); unless the page is
 *   marked MAP_PAGE_RESERVED in mem_map, the current task's rss is
 *   decremented (never below zero)
 */
static inline void forget_pte(pte_t page)
{
	if (pte_none(page))
		return;

	if (!pte_present(page)) {
		swap_free(pte_val(page));
		return;
	}

	free_page(pte_page(page));
	/* Reserved pages are not counted in rss. */
	if (mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED)
		return;
	if (current->mm->rss > 0)
		current->mm->rss--;
}
匿名映射主要是建立了一系列的页目录、页表,并把页表项初始化为指向全局共享的零页(写保护)。此时还没有为进程映射独立的物理页面,等到进程访问这个虚拟地址范围的时候,会在缺页中断处理函数do_no_page中处理。
/* Excerpt of do_no_page: the branch taken when the vma has no nopage operation. */
void do_no_page(struct vm_area_struct * vma, unsigned long address,
int write_access)
{
...
if (!vma->vm_ops || !vma->vm_ops->nopage) {
// One more resident page for the task
++vma->vm_task->mm->rss;
// Count this fault in min_flt
++vma->vm_task->mm->min_flt;
// Get a fresh page and install it in the page-table entry page_table
get_empty_page(vma, page_table);
return;
}
...
}
/*
 * get_empty_page - allocate a page and install it at *page_table.
 *
 * On success the new entry is built from the page and the vma's page
 * protection and is made writable. If no page can be allocated, oom() is
 * called for the vma's task and BAD_PAGE is installed instead.
 */
static inline void get_empty_page(struct vm_area_struct * vma, pte_t * page_table)
{
	unsigned long page = get_free_page(GFP_KERNEL);

	if (page) {
		put_page(page_table, pte_mkwrite(mk_pte(page, vma->vm_page_prot)));
		return;
	}

	/* Allocation failed. */
	oom(vma->vm_task);
	put_page(page_table, BAD_PAGE);
}
/*
 * put_page - store pte into the page-table slot page_table.
 *
 * The slot must be empty. If it already holds something, a message is
 * printed, the page referenced by the new pte is released with
 * free_page(), and the slot is left untouched.
 */
static void put_page(pte_t * page_table, pte_t pte)
{
	if (pte_none(*page_table)) {
		/* no need for invalidate */
		*page_table = pte;
		return;
	}

	printk("put_page: page already exists %08lx\n", pte_val(*page_table));
	free_page(pte_page(pte));
}
以上就是匿名映射的原理。
下面看看文件映射的过程。从开始的分析中我们知道,mmap会执行文件系统的mmap函数。文件系统的mmap函数使用的是filemap.c里的generic_mmap函数。
/* This is used for a general mmap of a disk file */
/*
 * generic_mmap - attach a regular file's inode and the matching
 * vm_operations to vma.
 *
 * inode: inode of the file being mapped
 * file:  the open file (not used here)
 * vma:   area being set up; vm_inode and vm_ops are filled in on success
 *
 * Returns 0 on success, or:
 *   -EACCES  the inode has no superblock or is not a regular file
 *   -EINVAL  vm_offset is not a multiple of the filesystem block size, or
 *            a shared mapping with VM_WRITE/VM_MAYWRITE was requested
 *            while SHARED_MMAP_REALLY_WORKS is undefined
 *   -ENOEXEC the inode has no bmap operation
 */
int generic_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;

	/*
	 * Validate the inode first: inode->i_sb must be known to be non-NULL
	 * before s_blocksize is read through it in the alignment check below.
	 */
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	/* The offset must be aligned to the filesystem block size. */
	if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
		return -EINVAL;
	if (!inode->i_op || !inode->i_op->bmap)
		return -ENOEXEC;

	/* Default to the private operations. */
	ops = &file_private_mmap;
	/* A shared mapping that is, or may become, writable needs the shared set. */
	if (vma->vm_flags & VM_SHARED) {
		if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) {
			static int nr = 0;
			ops = &file_shared_mmap;
#ifndef SHARED_MMAP_REALLY_WORKS /* it doesn't, yet */
			/* Log only the first few attempts; the request is always refused. */
			if (nr++ < 5) {
				printk("%s tried to do a shared writeable mapping\n", current->comm);
			}
			return -EINVAL;
#endif
		}
	}

	/* Update the access time and mark the inode dirty unless it is read-only. */
	if (!IS_RDONLY(inode)) {
		inode->i_atime = CURRENT_TIME;
		inode->i_dirt = 1;
	}

	/* Tie the vma to the inode, taking a reference on it. */
	vma->vm_inode = inode;
	inode->i_count++;
	/* Install the chosen operations. */
	vma->vm_ops = ops;
	return 0;
}
我们看到主要是设置了inode和操作函数集。然后再看看函数集都有什么。
/*
* Shared mappings need to be able to do the right thing at
* close/unmap/sync. They will also use the private file as
* backing-store for swapping..
*
* The initializer is positional; the trailing comment on each line names
* the slot it fills. Only the close, unmap, sync, nopage and swapout slots
* are populated; the others are NULL.
*/
static struct vm_operations_struct file_shared_mmap = {
NULL, /* open */
filemap_close, /* close */
filemap_unmap, /* unmap */
NULL, /* protect */
filemap_sync, /* sync */
NULL, /* advise */
filemap_nopage, /* nopage */
NULL, /* wppage */
filemap_swapout, /* swapout */
NULL, /* swapin */
};
/*
* Private mappings just need to be able to load in the map
*
* (this is actually used for shared mappings as well, if we
* know they can't ever get write permissions..)
*
* The initializer is positional; the trailing comment on each line names
* the slot it fills. Only the nopage slot is populated.
*/
static struct vm_operations_struct file_private_mmap = {
NULL, /* open */
NULL, /* close */
NULL, /* unmap */
NULL, /* protect */
NULL, /* sync */
NULL, /* advise */
filemap_nopage, /* nopage */
NULL, /* wppage */
NULL, /* swapout */
NULL, /* swapin */
};
对于私有和共享映射分别有一套函数集。这里主要关注缺页处理函数nopage(两套函数集里都是filemap_nopage)。该函数由操作系统的缺页中断处理函数调用。接着我们看一下缺页中断时的处理,下面是缺页中断处理函数。
/*
 * do_no_page - handle a fault on an address whose page-table entry is not
 * present.
 *
 * vma:          area containing the faulting address
 * address:      faulting virtual address
 * write_access: used as a boolean; when true the new entry is made
 *               writable and dirty
 *
 * Depending on the entry and the vma, the fault is handed to
 * do_swap_page(), satisfied with a fresh page (no nopage operation),
 * satisfied by share_page(), or satisfied by the vma's nopage operation.
 */
void do_no_page(struct vm_area_struct * vma, unsigned long address,
int write_access)
{
pte_t * page_table;
pte_t entry;
unsigned long page;
// Get the address of the page-table entry for `address` in the task's page tables
page_table = get_empty_pgtable(vma->vm_task,address);
// No page-table entry could be obtained: give up
if (!page_table)
return;
entry = *page_table;
// The entry is already present: nothing to do
if (pte_present(entry))
return;
// The entry is not present but not empty either: hand it to do_swap_page
if (!pte_none(entry)) {
do_swap_page(vma, address, page_table, entry, write_access);
return;
}
// Round the address down to a page boundary
address &= PAGE_MASK;
// No nopage operation on this vma, so this is not a file mapping
if (!vma->vm_ops || !vma->vm_ops->nopage) {
// One more resident page for the task
++vma->vm_task->mm->rss;
// Count this fault in min_flt
++vma->vm_task->mm->min_flt;
// Get a fresh page and install it in the page-table entry page_table
get_empty_page(vma, page_table);
return;
}
// Allocate a page up front (the result is checked only after share_page)
page = get_free_page(GFP_KERNEL);
// share_page reported success (presumably an already-loaded copy of the page was mapped)
if (share_page(vma, address, write_access, page)) {
// Count this fault in min_flt
++vma->vm_task->mm->min_flt;
// One more resident page for the task
++vma->vm_task->mm->rss;
return;
}
// The earlier allocation failed: report oom and install BAD_PAGE
if (!page) {
oom(current);
put_page(page_table, BAD_PAGE);
return;
}
// The nopage operation has to supply the page: count this fault in maj_flt
++vma->vm_task->mm->maj_flt;
// One more resident page for the task
++vma->vm_task->mm->rss;
/*
* The fourth argument is "no_share", which tells the low-level code
* to copy, not share the page even if sharing is possible. It's
* essentially an early COW detection
*/
// Call the vma's nopage operation with the page-aligned virtual address and the
// allocated page; its return value replaces page
page = vma->vm_ops->nopage(vma, address, page,
write_access && !(vma->vm_flags & VM_SHARED));
// Try share_page once more, now with 0 as the page argument; if it succeeds,
// release the page we just obtained
if (share_page(vma, address, write_access, 0)) {
free_page(page);
return;
}
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if write_access is true, we either now have
* a exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
// Build the new page-table entry from the page and the vma's page protection
entry = mk_pte(page, vma->vm_page_prot);
// Write fault: make the entry writable and dirty. Otherwise, for a non-shared
// mapping whose mem_map entry for the page is above 1, write-protect it
if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
} else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
// Store the finished entry in the page table
put_page(page_table, entry);
}
do_no_page先申请一个物理页,然后通过share_page判断该地址对应的数据是不是已经被其他进程加载过并且可以共享,是的话则不需要到硬盘加载。否则执行在mmap的时候设置的函数集中的filemap_nopage函数,把数据读到刚申请的物理页中。
/*
 * filemap_nopage - nopage operation for file mappings.
 *
 * Converts the faulting address into a byte position in the file
 * (address - vm_start + vm_offset), then into a file block index, and
 * looks up the block number for each filesystem block that makes up the
 * page with bmap(). The collected block numbers are handed to
 * bread_page() together with the destination page.
 *
 * Returns whatever bread_page() returns.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address,
	unsigned long page, int no_share)
{
	struct inode *inode = area->vm_inode;
	unsigned int first_block;
	int blocks_per_page;
	int nr[8];
	int k = 0;

	/* Page-align the address before turning it into a file position. */
	address &= PAGE_MASK;
	first_block = address - area->vm_start + area->vm_offset;
	first_block >>= inode->i_sb->s_blocksize_bits;

	blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
	/* At least one block is always looked up, even if the count is zero. */
	do {
		nr[k] = bmap(inode, first_block + k);
		k++;
	} while (k < blocks_per_page);

	return bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, no_share);
}
该函数就是算出要加载的数据在硬盘哪个块,然后加载进来,并写入物理页中。然后返回物理地址。我们再回到操作系统的缺页中断函数中,从vma->vm_ops->nopage继续往下看。
// Call the vma's nopage operation; its return value replaces page
page = vma->vm_ops->nopage(vma, address, page,
write_access && !(vma->vm_flags & VM_SHARED));
// Try share_page with 0 as the page argument; if it succeeds, release the page
if (share_page(vma, address, write_access, 0)) {
free_page(page);
return;
}
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if write_access is true, we either now have
* a exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
// Build the new page-table entry from the page and the vma's page protection
entry = mk_pte(page, vma->vm_page_prot);
// Write fault: make the entry writable and dirty. Otherwise, for a non-shared
// mapping whose mem_map entry for the page is above 1, write-protect it
if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
} else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
// Store the finished entry in the page table
put_page(page_table, entry);
page_table是在该函数开始处取得的页表项地址。这里先用物理页地址和vma的保护属性构造出页表项的内容entry,最后通过put_page把entry写入page_table指向的页表项。下次访问这一页就不会再发生缺页中断了。
这就是文件映射的原理。