I've been writing a toy OS in my spare time and am now strengthening its memory management. The buddy-system page allocator is finished, so the next step is finer-grained allocation, since the buddy system hands out memory only in whole pages. The reference kernel is Linux 2.6.30; I haven't studied newer sources, but the core ideas of the algorithm should be much the same.
The slab allocator is an efficient memory allocator: it organizes each frequently used object type, e.g. 32-byte or 64-byte blocks, or blocks the size of some commonly used struct, into a kmem_cache structure. Frequently allocated and freed objects are cached in an array_cache, so on those hot paths both allocation and free run in O(1). kmalloc() itself allocates from the slab allocator. The overall structure is shown in the figure below: a slab consists of one or more pages, and its management structure either lives on the slab's own pages or is allocated dynamically from another kmem_cache.
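To make these moving parts concrete before reading the source, here is a minimal usage sketch of the classic slab API as it exists in 2.6.30; struct my_type, my_cachep and the two functions are made-up names for illustration:

#include <linux/slab.h> /* kmem_cache_create/alloc/free (adjust the header for a toy OS) */

struct my_type { int a, b; };          /* hypothetical frequently-used object type */
static struct kmem_cache *my_cachep;

static void my_ctor(void *obj)         /* optional constructor, run when a slab's objects are first set up */
{
    struct my_type *t = obj;
    t->a = t->b = 0;
}

void my_init(void)
{
    /* one kmem_cache per object type; 2.6.30 signature: name, size, align, flags, ctor */
    my_cachep = kmem_cache_create("my_type", sizeof(struct my_type), 0, 0, my_ctor);
}

void my_use(void)
{
    struct my_type *p = kmem_cache_alloc(my_cachep, GFP_KERNEL); /* O(1) fast path */
    /* ... use p ... */
    kmem_cache_free(my_cachep, p);     /* returns to the per-CPU array_cache */
}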
Source walkthrough: to keep the focus on the algorithm's core ideas, the kernel's locking and debug-checking code is skipped. We start with kmem_cache initialization; first, a few important structures.
/*
* struct kmem_cache
*
* manages a cache.
*/
/* This structure is the core of the slab allocator: one is created per object type, e.g. 32 bytes, 64 bytes, task_struct, and so on. */
struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
/* per-CPU array caches; on a single-core build NR_CPUS is 1 */
struct array_cache *array[NR_CPUS];
/* 2) Cache tunables. Protected by cache_chain_mutex */
unsigned int batchcount; // number of objects moved to/from the array cache at a time
unsigned int limit; // capacity of the array cache
unsigned int shared; // sizing factor for the per-node shared array cache (shared * batchcount entries)
unsigned int buffer_size; // object size, including alignment padding
u32 reciprocal_buffer_size; // reciprocal of buffer_size, for fast index computation without division
/* 3) touched by every alloc & free from the backend */
unsigned int flags; /* constant flags */
unsigned int num; /* # of objs per slab */
/* 4) cache_grow/shrink */
/* order of pgs per slab (2^n) */
unsigned int gfporder; // each slab spans 2^gfporder pages
/* force GFP flags, e.g. GFP_DMA */
gfp_t gfpflags; // flags used when allocating the slab's pages
size_t colour; /* cache colouring range */ // number of distinct colours per slab
unsigned int colour_off; /* colour offset */ // colouring step, equal to the cache line size
struct kmem_cache *slabp_cache; // cache used when the slab management structure is kept off-slab
unsigned int slab_size; // size of the slab management area (struct slab + bufctl array)
unsigned int dflags; /* dynamic flags */
/* constructor func */
void (*ctor)(void *obj); // constructor callback that initializes new objects, supplied by the creator
/* 5) cache creation/removal */
const char *name; // name of this kmem_cache
struct list_head next; // links this cache into the global cache_chain
/*
* We put nodelists[] at the end of kmem_cache, because we want to size
* this array to nr_node_ids slots instead of MAX_NUMNODES
* (see kmem_cache_init())
* We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
* is statically defined, so we reserve the max number of nodes.
*/
struct kmem_list3 *nodelists[MAX_NUMNODES]; // per-node "three-list" structure; on UMA MAX_NUMNODES is 1
/*
* Do not add fields after nodelists[]
*/
};
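A small aside on how buffer_size and reciprocal_buffer_size are used: since every object in a slab has the same size, mapping between an object's index and its address is plain arithmetic on s_mem, the first-object pointer kept in struct slab (not shown above). This is essentially 2.6.30's index_to_obj()/obj_to_index(), simplified here; the real obj_to_index() replaces the division with a multiply by reciprocal_buffer_size:

/* simplified; in 2.6.30 these live in mm/slab.c */
static inline void *index_to_obj(struct kmem_cache *cache,
                                 struct slab *slabp, unsigned int idx)
{
    return slabp->s_mem + cache->buffer_size * idx;
}

static inline unsigned int obj_to_index(struct kmem_cache *cache,
                                        struct slab *slabp, void *obj)
{
    /* the real kernel uses reciprocal_divide(offset, reciprocal_buffer_size)
     * to avoid a hardware division on this hot path */
    return ((char *)obj - (char *)slabp->s_mem) / cache->buffer_size;
}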
/*
* The slab lists for all objects.
*/
// holds the three kinds of slab lists for one node
struct kmem_list3 {
struct list_head slabs_partial; /* partial list first, better asm code */ // slabs with both used and free objects
struct list_head slabs_full; // fully allocated slabs
struct list_head slabs_free; // completely free slabs
unsigned long free_objects; // number of free objects on this node
unsigned int free_limit; // max number of free objects kept before slabs are released
unsigned int colour_next; /* Per-node cache coloring */ // colour index for the next slab
spinlock_t list_lock;
struct array_cache *shared; /* shared per node */ // array cache shared by all CPUs on this node
struct array_cache **alien; /* on other nodes */
unsigned long next_reap; /* updated without locking */
int free_touched; /* updated without locking */
};
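When a per-CPU array cache runs dry, the refill path pulls objects from these lists, preferring partially full slabs so that completely free ones stay reclaimable; slabs migrate between the three lists as they fill and empty. A simplified sketch of the slab-selection step, modeled on cache_alloc_refill() from 2.6.30 (locking omitted):

/* pick the slab to take the next objects from (simplified) */
static struct slab *pick_slab(struct kmem_list3 *l3)
{
    struct list_head *entry = l3->slabs_partial.next;
    if (entry == &l3->slabs_partial) {      /* no partial slabs */
        l3->free_touched = 1;               /* record that we dipped into free slabs */
        entry = l3->slabs_free.next;
        if (entry == &l3->slabs_free)
            return NULL;                    /* nothing left: cache_grow() must add a slab */
    }
    return list_entry(entry, struct slab, list);
}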
/*
* bootstrap: The caches do not work without cpuarrays anymore, but the
* cpuarrays are allocated from the generic caches...
*/
#define BOOT_CPUCACHE_ENTRIES 1
struct arraycache_init {
struct array_cache cache;
void *entries[BOOT_CPUCACHE_ENTRIES];
};
struct array_cache {
unsigned int avail;
unsigned int limit;
unsigned int batchcount;
unsigned int touched;
spinlock_t lock;
void *entry[]; /*
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
*/
};
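entry[] behaves as a LIFO stack of object pointers, which is what makes the hot paths O(1) and hands back cache-warm objects first. Roughly what the 2.6.30 fast paths (____cache_alloc() and __cache_free()) boil down to, with debug hooks stripped:

static void *fast_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    struct array_cache *ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        ac->touched = 1;
        return ac->entry[--ac->avail];        /* pop the most recently freed object */
    }
    return cache_alloc_refill(cachep, flags); /* slow path: refill batchcount objects */
}

static void fast_free(struct kmem_cache *cachep, void *objp)
{
    struct array_cache *ac = cpu_cache_get(cachep);
    if (unlikely(ac->avail == ac->limit))
        cache_flusharray(cachep, ac);         /* slow path: return batchcount objects */
    ac->entry[ac->avail++] = objp;            /* push; the next alloc reuses it */
}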
The analysis below covers the UMA case only.
/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
* Initializes the slab allocator: bootstraps cache_cache (the cache of struct kmem_cache objects) and the general kmalloc caches.
*/
void kmem_cache_init(void)
{
size_t left_over; // bytes left over in a slab after packing objects
struct cache_sizes *sizes; // size table of the general kmalloc caches
struct cache_names *names; // name table of the general kmalloc caches
int i;
int order;
int node;
// non-NUMA build: the alien-cache setup stays disabled here
/*if (num_possible_nodes() == 1) {
use_alien_caches = 0;
numa_platform = 0;
}*/
// NUM_INIT_LISTS is 3 * MAX_NUMNODES, i.e. 3 on a single-node build
for (i = 0; i < NUM_INIT_LISTS; i++) {
kmem_list3_init(&initkmem_list3[i]);
if (i < MAX_NUMNODES)
cache_cache.nodelists[i] = NULL; // cache_cache is the statically allocated cache that manages all kmem_cache objects
}
set_up_list3s(&cache_cache, CACHE_CACHE);
/*
* Fragmentation resistance on low memory - only use bigger
* page orders on machines with more than 32MB of memory.
*/
if (num_physpages > (32 << 20) >> PAGE_SHIFT)
slab_break_gfp_order = BREAK_GFP_ORDER_HI;
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
* 1) initialize the cache_cache cache: it contains the struct
* kmem_cache structures of all caches, except cache_cache itself:
* cache_cache is statically allocated.
* Initially an __init data area is used for the head array and the
* kmem_list3 structures, it's replaced with a kmalloc allocated
* array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
* The struct kmem_cache for the new cache is allocated normally.
* An __init data area is used for the head array.
* 3) Create the remaining kmalloc caches, with minimally sized
* head arrays.
* 4) Replace the __init data head arrays for cache_cache and the first
* kmalloc cache with kmalloc allocated arrays.
* 5) Replace the __init data for kmem_list3 for cache_cache and
* the other cache's with kmalloc allocated memory.
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
node = numa_node_id(); // node = 0 on UMA
/* 1) create the cache_cache */
INIT_LIST_HEAD(&cache_chain); // initialize the global list of caches
list_add(&cache_cache.next, &cache_chain); // add cache_cache itself to it
cache_cache.colour_off = cache_line_size(); // colouring step of cache_cache = cache line size
// point cache_cache's per-CPU array at the static bootstrap array
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
// point cache_cache's node lists at the static bootstrap lists
cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
/*
* struct kmem_cache size depends on nr_node_ids, which
* can be less than MAX_NUMNODES.
*/
// object size of cache_cache, i.e. struct kmem_cache trimmed to the node pointers actually needed
cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
1 * sizeof(struct kmem_list3 *);
// round the object size up to a cache line
cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
cache_line_size());
// reciprocal of the object size (left disabled in this port)
//cache_cache.reciprocal_buffer_size =
// reciprocal_value(cache_cache.buffer_size);
// find the smallest page order that fits at least one object; also yields num and left_over
for (order = 0; order < MAX_ORDER; order++) {
cache_estimate(order, cache_cache.buffer_size,
cache_line_size(), 0, &left_over, &cache_cache.num);
if (cache_cache.num)
break;
}
//BUG_ON(!cache_cache.num);
cache_cache.gfporder = order;
// number of colours = leftover bytes / cache line size: each successive slab
// shifts its first object by colour_off, at most up to the leftover space
cache_cache.colour = left_over / cache_cache.colour_off;
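/* For reference, cache_grow() in 2.6.30 cycles through the colours when it
 * builds a new slab:
 *     offset = l3->colour_next;
 *     if (++l3->colour_next >= cachep->colour)
 *         l3->colour_next = 0;
 *     offset *= cachep->colour_off;  // first object starts at this offset
 * so consecutive slabs start their objects on different cache lines. */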
// slab_size is the size of the management area: struct slab plus the
// kmem_bufctl_t array (one index per object; each entry holds the index of
// the next free object, forming a free list threaded through the array)
cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
sizeof(struct slab), cache_line_size());
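/* Worked example with hypothetical numbers: on a 32-bit build with 4-byte
 * kmem_bufctl_t, a 32-byte struct slab and 64-byte cache lines, a slab that
 * holds num = 26 objects needs ALIGN(26 * 4 + 32, 64) = ALIGN(136, 64) = 192
 * bytes of management area. */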
/* 2+3) create the kmalloc caches */
sizes = malloc_sizes;
names = cache_names;
/*
* Initialize the caches that provide memory for the array cache and the
* kmem_list3 structures first. Without this, further allocations will
* bug.
*/
// create the general cache sized for struct arraycache_init (the 32-byte cache in this config)
sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
sizes[INDEX_AC].cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
NULL);
printf("sizes[INDEX_AC].cs_cachep:0x%p\n",sizes[INDEX_AC].cs_cachep);
// create the general cache sized for struct kmem_list3, if it falls in a different size class
if (INDEX_AC != INDEX_L3) {
sizes[INDEX_L3].cs_cachep =
kmem_cache_create(names[INDEX_L3].name,
sizes[INDEX_L3].cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
NULL);
}
// end of early bootstrap: from here on kmem_cache_create() may place management structures off-slab
slab_early_init = 0;
while (sizes->cs_size != ULONG_MAX) {
/*
* For performance, all the general caches are L1 aligned.
* This should be particularly beneficial on SMP boxes, as it
* eliminates "false sharing".
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches.
*/
if (!sizes->cs_cachep) {
sizes->cs_cachep = kmem_cache_create(names->name,
sizes->cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
NULL);
}
sizes++;
names++;
}
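/* malloc_sizes[] is generated from kmalloc_sizes.h (typically 32, 64, 128,
 * ... bytes, depending on PAGE_SIZE and the L1 cache line size) and ends
 * with a ULONG_MAX sentinel, which terminates this loop; kmalloc(size)
 * later picks the first general cache whose cs_size >= size. */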
/* 4) Replace the bootstrap head arrays */
// swap cache_cache's static bootstrap array_cache for memory now managed by the slab allocator itself
{
struct array_cache *ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
//local_irq_disable();
//BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
memcpy(ptr, cpu_cache_get(&cache_cache),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
// spin_lock_init(&ptr->lock);
cache_cache.array[smp_processor_id()] = ptr;
// local_irq_enable();
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
//local_irq_disable();
// BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
// != &initarray_generic.cache);
// same swap for the INDEX_AC cache, i.e. the cache backing struct arraycache_init
memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
// spin_lock_init(&ptr->lock);
malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
ptr;
//local_irq_enable();
}
/* 5) Replace the bootstrap kmem_list3's */
{
int nid;
for(nid = 0; nid < 1; nid++) { // single node here; the real kernel iterates all online nodes
init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
init_list(malloc_sizes[INDEX_AC].cs_cachep,
&initkmem_list3[SIZE_AC + nid], nid);
if (INDEX_AC != INDEX_L3) {
init_list(malloc_sizes[INDEX_L3].cs_cachep,
&initkmem_list3[SIZE_L3 + nid], nid);
}
}
}
/* 6) resize the head arrays to their final sizes */
{
struct kmem_cache *cachep;
//mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next) {
if (enable_cpucache(cachep))
assert(0);
}
//mutex_unlock(&cache_chain_mutex);
}
/* Annotate slab for lockdep -- annotate the malloc caches */
// init_lock_keys();
/* Done! */
g_cpucache_up = FULL;
/*
* Register a cpu startup notifier callback that initializes
* cpu_cache_get for all new cpus
*/
// register_cpu_notifier(&cpucache_notifier);
/*
* The reap timers are started later, with a module init call: That part
* of the kernel is not yet operational.
*/
}
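For completeness, the order loop in step 1 above leans on cache_estimate(), which determines, for a given page order, how many objects fit into a slab and how many bytes are left over (those leftovers become the colouring space). A simplified version of the 2.6.30 routine, keeping only the on-slab case and dropping the SLAB_LIMIT clamp:

/* simplified cache_estimate() (cf. mm/slab.c in 2.6.30): on-slab case only.
 * Slab layout: [struct slab][kmem_bufctl_t array][padding][objects][left_over] */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
                           size_t align, int flags,
                           size_t *left_over, unsigned int *num)
{
    size_t slab_bytes = PAGE_SIZE << gfporder;
    size_t mgmt_size;
    int nr_objs;

    /* first guess, ignoring the alignment padding of the management area */
    nr_objs = (slab_bytes - sizeof(struct slab)) /
              (buffer_size + sizeof(kmem_bufctl_t));

    /* the guess may be one too high once padding is accounted for */
    mgmt_size = ALIGN(sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t), align);
    if (mgmt_size + nr_objs * buffer_size > slab_bytes) {
        nr_objs--;
        mgmt_size = ALIGN(sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t), align);
    }

    *num = nr_objs;
    *left_over = slab_bytes - nr_objs * buffer_size - mgmt_size;
}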