Postgresql存储底层封装smgr源码分析

mingjie

发布于 2022-05-12 10:56:32

7370

发布于 2022-05-12 10:56:32

文章被收录于专栏：Postgresql源码分析

注：本专栏所有分析以函数为主线，必要数据结构会带入讲解；数据库版本为Postgresql10.16。注：如有讨论的需要请email to jackgo73@outlook.com

概要

Postgresql外存管理器封装了与文件系统的所有接口，提供更高级、更适用于数据库的文件操作接口，这部分代码的位置：

$ pwd
/home/mingjie.gmj/dev/src/postgresql-10.16/src/backend/storage/smgr

$ ll *.c
-rw-r--r-- 1 mingjie.gmj mingjie.gmj 63002 Feb  9 05:59 md.c
-rw-r--r-- 1 mingjie.gmj mingjie.gmj 22203 Feb  9 05:59 smgr.c
-rw-r--r-- 1 mingjie.gmj mingjie.gmj  1521 Feb  9 05:59 smgrtype.c

本篇针对smgr中的函数进行拆解、分析，深入理解pg对于外存的管理细节。

关键流程分析

1 初始化流程init

smgr初始化流程分析。注意：初始化发生在每一个backend服务进程启动时，而不是在Postmaster中。相关资源是backend-local的，进程退出后资源也随之释放。

1.1 入口接口函数：smgrinit

void
smgrinit(void)
{
    int         i;

    for (i = 0; i < NSmgr; i++)
    {
        if (smgrsw[i].smgr_init)
            (*(smgrsw[i].smgr_init)) ();
    }

    /* register the shutdown proc */
    on_proc_exit(smgrshutdown, 0);
}

（1）NSmgr：smgrsw[]数组长度

（2）smgrsw数组保存了一套接口函数，当前只有一套实现方法，就是md系列函数，定义在md.c中。

/*
 * Storage manager dispatch table.  Each element is a positional f_smgr
 * initializer, so the entries below follow the field order of f_smgr.
 * The first slot is the smgr_init hook invoked by smgrinit().
 */
static const f_smgr smgrsw[] = {
    /* magnetic disk */
    {
        mdinit,
        NULL,                   /* no hook for this slot (presumably shutdown -- confirm against f_smgr) */
        mdclose,
        mdcreate,
        mdexists,
        mdunlink,
        mdextend,
        mdprefetch,
        mdread,
        mdwrite,
        mdwriteback,
        mdnblocks,
        mdtruncate,
        mdimmedsync,
        mdpreckpt,
        mdsync,
        mdpostckpt
    }
};

1.2 入口函数 mdinit

核心流程创建哈希表

        /*
         * Excerpt from mdinit(): build the pending-operations hash table
         * (keyed by RelFileNode, entries of type PendingOperationEntry) in
         * its own memory context, and start with an empty unlink list.
         */
        HASHCTL     hash_ctl;
        /* Dedicated context, child of MdCxt, that will own the hash table. */
        pendingOpsCxt = AllocSetContextCreate(MdCxt,
                                              "Pending ops context",
                                              ALLOCSET_DEFAULT_SIZES);
        /* Permit allocations from this context inside critical sections. */
        MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

        /* Zero the control struct so that unused fields are well-defined. */
        MemSet(&hash_ctl, 0, sizeof(hash_ctl));
        hash_ctl.keysize = sizeof(RelFileNode);
        hash_ctl.entrysize = sizeof(PendingOperationEntry);
        hash_ctl.hcxt = pendingOpsCxt;
        /* 100 is the initial size estimate passed to hash_create. */
        pendingOpsTable = hash_create("Pending Ops Table",
                                      100L,
                                      &hash_ctl,
                                      HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
        /* No pending unlink requests yet. */
        pendingUnlinks = NIL;

这里需要牢记的是哈希表的key和entry的类型：

key：RelFileNode

/*
 * RelFileNode
 *      Physical identifier of a relation, made up of three OIDs:
 *      tablespace, database and relation.  Used as the key of the
 *      pending-operations hash table.
 */
typedef struct RelFileNode
{
    Oid         spcNode;        /* tablespace */
    Oid         dbNode;         /* database */
    Oid         relNode;        /* relation */
} RelFileNode;

value：PendingOperationEntry

/* Counter type used to tag pending requests with a sync cycle number. */
typedef uint16 CycleCtr;        /* can be any convenient integer size */

/*
 * PendingOperationEntry
 *      One entry of the pending-operations hash table.  The key (rnode)
 *      identifies a relation; the remaining fields track, per fork, which
 *      segments still need an fsync and whether requests were canceled.
 */
typedef struct
{
    RelFileNode rnode;          /* hash table key (must be first!) */
    CycleCtr    cycle_ctr;      /* mdsync_cycle_ctr of oldest request */
    /* requests[f] has bit n set if we need to fsync segment n of fork f */
    Bitmapset  *requests[MAX_FORKNUM + 1];
    /* canceled[f] is true if we canceled fsyncs for fork "recently" */
    bool        canceled[MAX_FORKNUM + 1];
} PendingOperationEntry;

Bitmapset

/* The unit size can be adjusted by changing these three declarations: */
#define BITS_PER_BITMAPWORD 32
typedef uint32 bitmapword;      /* must be an unsigned type */
typedef int32 signedbitmapword; /* must be the matching signed type */

/*
 * Bitmapset
 *      Variable-length bit set: a word count followed by that many
 *      bitmapwords, each holding BITS_PER_BITMAPWORD bits.
 */
typedef struct Bitmapset
{
    int         nwords;         /* number of words in array */
    bitmapword  words[FLEXIBLE_ARRAY_MEMBER];   /* really [nwords] */
} Bitmapset;

每次checkpoint之后发生的所有页面写操作（即需要fsync的段）都要记录下来，这样下次checkpoint的时候才能把它们全部刷到磁盘上。

这里用哈希表记录所有待处理的操作，方便合并针对同一个文件的重复请求。

2 打开表文件

/*
 * RelationOpenSmgr
 *      Ensure the relation has an smgr-level handle.  When rd_smgr is
 *      still NULL, call smgropen() with the relation's rd_node and
 *      rd_backend and hand the result to smgrsetowner() together with the
 *      address of rd_smgr.  Does nothing if rd_smgr is already set.
 *
 *      The argument is evaluated more than once.
 */
#define RelationOpenSmgr(relation) \
    do { \
        if (!(relation)->rd_smgr) \
            smgrsetowner(&(relation)->rd_smgr, \
                         smgropen((relation)->rd_node, \
                                  (relation)->rd_backend)); \
    } while (0)

smgrsetowner比较简单，即在relation中记录open后的结构

&((relation)->rd_smgr) = SMgrRelation

关键的是smgropen

2.1 smgropen

【小技巧】 1、打点位置：b bufmgr.c:647 调试要第一次打开表文件，否则进不去 2、对文件的所有函数打点：rbreak md.c:.

先看入参：smgropen(RelFileNode rnode, BackendId backend)

RelFileNode rnode
- Relation->rd_node: Relation{ RelFileNode rd_node; /* relation physical identifier */ }
BackendId backend
- Relation->rd_backend: Relation{ BackendId rd_backend; /* owning backend id, if temporary relation */}

实际传入表物理文件ID和当前操作的bkID（正常表是-1，临时表才有）。

例如下面实例分别对应表空间、数据库、表文件的oid。

(gdb) p rnode

$2 = {spcNode = 1663, dbNode = 13214, relNode = 2684}

/*
 * smgropen
 *      Return the SMgrRelation for the given (rnode, backend) pair.
 *
 * The lookup table SMgrRelationHash is created lazily on the first call.
 * If no entry exists yet for the key, one is inserted, reset to its
 * "nothing known, nothing open" state and appended to the list of unowned
 * relations.  An existing entry is returned unchanged.
 */
SMgrRelation
smgropen(RelFileNode rnode, BackendId backend)
{
    RelFileNodeBackend key;
    SMgrRelation entry;
    bool        existed;
    int         fork;

    /* First call: build the hash table and the unowned-relations list. */
    if (SMgrRelationHash == NULL)
    {
        HASHCTL     info;

        MemSet(&info, 0, sizeof(info));
        info.keysize = sizeof(RelFileNodeBackend);
        info.entrysize = sizeof(SMgrRelationData);
        SMgrRelationHash = hash_create("smgr relation table", 400,
                                       &info, HASH_ELEM | HASH_BLOBS);
        dlist_init(&unowned_relns);
    }

    /* Find the entry for this key, inserting a new one if it is missing. */
    key.node = rnode;
    key.backend = backend;
    entry = (SMgrRelation) hash_search(SMgrRelationHash,
                                       (void *) &key,
                                       HASH_ENTER, &existed);

    /* An entry that was already present needs no further setup. */
    if (existed)
        return entry;

    /*
     * Fresh entry: hash_search has already stored the lookup key, so only
     * the remaining fields need to be initialized.
     */
    entry->smgr_owner = NULL;
    entry->smgr_targblock = InvalidBlockNumber;
    entry->smgr_fsm_nblocks = InvalidBlockNumber;
    entry->smgr_vm_nblocks = InvalidBlockNumber;
    entry->smgr_which = 0;      /* we only have md.c at present */

    /* No segment of any fork is open yet. */
    fork = 0;
    while (fork <= MAX_FORKNUM)
    {
        entry->md_num_open_segs[fork] = 0;
        fork++;
    }

    /* Nobody owns the new entry yet, so track it on the unowned list. */
    dlist_push_tail(&unowned_relns, &entry->node);

    return entry;
}

找不到就初始化一个SMgrRelation结构，例如第一次访问返回的reln：

{
    smgr_rnode = 
    {
        node = 
        {
            spcNode = 1663, 
            dbNode = 13214, 
            relNode = 2684}, 
            backend = -1
        },
     smgr_owner = 0x0, 
     smgr_targblock = 4294967295, 
     smgr_fsm_nblocks = 4294967295,
     smgr_vm_nblocks = 4294967295, 
     smgr_which = 0, 
     md_num_open_segs = {0, 0, 0, 0}, 
     md_seg_fds = {0x0,0x0, 0x0, 0x0}, 
     node = {
         prev = 0xf37060 <unowned_relns>, next = 0xf37060 <unowned_relns>
     }
 }

2.2 mdread

mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,char *buffer)

参数

SMgrRelation reln

{smgr_rnode = {node = {spcNode = 1663, dbNode = 13214, relNode = 2684}, backend = -1},smgr_owner = 0x7f7016220d80, smgr_targblock = 4294967295, smgr_fsm_nblocks = 4294967295,smgr_vm_nblocks = 4294967295, smgr_which = 0, md_num_open_segs = {1, 0, 0, 0}, md_seg_fds = {0x1cef810, 0x0, 0x0, 0x0}, node = {prev = 0xf37060 <unowned_relns>,next = 0xf37060 <unowned_relns>}}

ForkNumber forknum

MAIN_FORKNUM

BlockNumber blocknum

1

核心逻辑三步

第一步：打开MdfdVec

v = _mdfd_getseg(reln, forknum, blocknum, false,EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

第二步：定位文件位置

seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

第三步：读取

nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);

2.3 _mdfd_getseg

_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)

参数

前三同上

skipFsync

false

behavior

EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY

数据结构

代表逻辑上的一张表

/*
 * SMgrRelationData
 *      smgr-level state for one relation, identified by smgr_rnode.
 *      Holds cached block counts maintained by higher levels, the storage
 *      manager selector, and md.c's per-fork open-segment bookkeeping.
 */
typedef struct SMgrRelationData
{
    /* rnode is the hashtable lookup key, so it must be first! */
    RelFileNodeBackend smgr_rnode;  /* relation physical identifier */

    /* pointer to owning pointer, or NULL if none */
    struct SMgrRelationData **smgr_owner;

    /*
     * These next three fields are not actually used or manipulated by smgr,
     * except that they are reset to InvalidBlockNumber upon a cache flush
     * event (in particular, upon truncation of the relation).  Higher levels
     * store cached state here so that it will be reset when truncation
     * happens.  In all three cases, InvalidBlockNumber means "unknown".
     */
    BlockNumber smgr_targblock; /* current insertion target block */
    BlockNumber smgr_fsm_nblocks;   /* last known size of fsm fork */
    BlockNumber smgr_vm_nblocks;    /* last known size of vm fork */

    /* additional public fields may someday exist here */

    /*
     * Fields below here are intended to be private to smgr.c and its
     * submodules.  Do not touch them from elsewhere.
     */
    int         smgr_which;     /* storage manager selector */

    /*
     * for md.c; per-fork arrays of the number of open segments
     * (md_num_open_segs) and the segments themselves (md_seg_fds).
     */
    int         md_num_open_segs[MAX_FORKNUM + 1];
    struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];

    /* if unowned, list link in list of all unowned SMgrRelations */
    dlist_node  node;
} SMgrRelationData;

逻辑

第一步：根据块号计算目标段号

targetseg = blkno / ((BlockNumber) RELSEG_SIZE);

注意：#define RELSEG_SIZE 131072 一个文件中允许存在的最大页面数量，一个页面8k，131072 x 8 = 1048576 k = 1G

注意：这里是整除（/），得到的是该块所在的段（segment）号，也就是映射到某一个具体的表文件上。PG中单个表文件最大1G，超过后会切分成多个段文件，但块号是在整张表范围内全局编号的，所以这里需要做一次映射。

第二步：判断该reln的目标段是否已经打开

if (targetseg < reln->md_num_open_segs[forknum]) {

v = &reln->md_seg_fds[forknum][targetseg];

如果已经打开直接使用，只有主文件可能有多个targetseg，超1G了会切分。

第三步：假设md_num_open_segs都是0，说明还没有打开任何段，使用mdopen函数打开。注意mdopen只打开第一个段（第0段）。

mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)

参数：

mdopen (reln=0x1ceb618, forknum=MAIN_FORKNUM, behavior=9) at md.c:623

/*
 * mdopen
 *      Open the first segment (segment 0) of the given fork and return its
 *      MdfdVec.
 *
 * If the fork already has at least one open segment, the existing entry
 * for segment 0 is returned without touching the file system.
 *
 * On open failure: returns NULL when behavior includes
 * EXTENSION_RETURN_NULL and errno indicates the file was possibly deleted;
 * otherwise reports an ERROR.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
{
    MdfdVec    *seg0;
    char       *fname;
    File        vfd;

    /* Nothing to do when segment 0 is already open. */
    if (reln->md_num_open_segs[forknum] > 0)
        return &reln->md_seg_fds[forknum][0];

    fname = relpath(reln->smgr_rnode, forknum);

    vfd = PathNameOpenFile(fname, O_RDWR | PG_BINARY, 0600);

    /*
     * In bootstrap mode a system relation may be accessed before the
     * bootstrap script has nominally created it, so a failed open is
     * retried with O_CREAT | O_EXCL, letting mdopen() stand in for
     * mdcreate() in that mode only. (See mdcreate)
     */
    if (vfd < 0 && IsBootstrapProcessingMode())
        vfd = PathNameOpenFile(fname, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

    if (vfd < 0)
    {
        /* Caller tolerates a missing file: release the path and give up. */
        if ((behavior & EXTENSION_RETURN_NULL) != 0 &&
            FILE_POSSIBLY_DELETED(errno))
        {
            pfree(fname);
            return NULL;
        }
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", fname)));
    }

    pfree(fname);

    /* Make room for exactly one segment entry and fill it in. */
    _fdvec_resize(reln, forknum, 1);
    seg0 = &reln->md_seg_fds[forknum][0];
    seg0->mdfd_vfd = vfd;
    seg0->mdfd_segno = 0;

    /* A single segment must not hold more than RELSEG_SIZE blocks. */
    Assert(_mdnblocks(reln, forknum, seg0) <= ((BlockNumber) RELSEG_SIZE));

    return seg0;
}

第四步：从当前已经打开的段数md_num_open_segs[forknum]（第一次的话经过第三步应该只打开了第0段）开始，到需要的目标段号targetseg为止，依次遍历并打开这之间的所有段。

for (nextsegno = reln->md_num_open_segs[forknum];nextsegno <= targetseg; nextsegno++)

{

（1）计算当前MdfdVec指向的文件，有多少个块

BlockNumber nblocks = _mdnblocks(reln, forknum, v);

（2）按需扩展新文件（表文件切分）

（3）打开文件，维护两个数组结构

v = _mdfd_openseg(reln, forknum, nextsegno, flags);

总结

最重要的两个数组：

typedef struct SMgrRelationData
{
    ...
    int         md_num_open_segs[MAX_FORKNUM + 1];
    struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
    ....
} SMgrRelationData;

第一个数组记录每个fork当前已打开的段（segment）数量，而不是段号；小于1G的表不会切分，只有一个文件，打开后对应位置存1。数组的四个位置对应四类fork文件（main、fsm、vm、init）。

第二个数组记录MdfdVec结构的指针：四个位置分别存放四个fork各自的MdfdVec数组的首地址，数组的每个元素记录一个段文件的虚拟文件描述符（mdfd_vfd）和该段的段号（mdfd_segno）。

【简版】