注:本专栏所有分析以函数为主线,必要数据结构会带入讲解;数据库版本为Postgresql10.16。 注:如有讨论的需要请email to jackgo73@outlook.com
Postgresql外存管理器封装了与文件系统的所有接口,提供更高级、更适用于数据库的文件操作接口,这部分代码的位置:
$ pwd
/home/mingjie.gmj/dev/src/postgresql-10.16/src/backend/storage/smgr
$ ll *.c
-rw-r--r-- 1 mingjie.gmj mingjie.gmj 63002 Feb 9 05:59 md.c
-rw-r--r-- 1 mingjie.gmj mingjie.gmj 22203 Feb 9 05:59 smgr.c
-rw-r--r-- 1 mingjie.gmj mingjie.gmj 1521 Feb 9 05:59 smgrtype.c
本篇针对smgr中的函数进行拆解、分析,深入理解pg对于外存的管理细节。
smgr初始化流程分析,注意初始化发生在每一个backend服务进程启动时,注意不是Postmaster。资源是backend-local的,进程退出了资源也就释放了。
/*
 * smgrinit
 *		Call the init hook of every storage manager listed in smgrsw[],
 *		then register smgrshutdown as a process-exit callback.
 */
void
smgrinit(void)
{
int i;
/* Walk all NSmgr table slots; a manager without an init hook is skipped */
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_init)
(*(smgrsw[i].smgr_init)) ();
}
/* register the shutdown proc */
on_proc_exit(smgrshutdown, 0);
}
(1)NSmgr:smgrsw[]数组长度
(2)smgrsw数组保存了一套接口函数,当前只有一套实现方法,就是md系列函数,定义在md.c中。
/*
 * Table of storage-manager entry points, one initializer per manager.
 * Only the md* family is listed; smgrinit() walks this table and calls
 * each non-NULL smgr_init hook.
 * NOTE(review): the second slot is NULL -- presumably the shutdown hook of
 * f_smgr; confirm against the struct definition.
 */
static const f_smgr smgrsw[] = {
/* magnetic disk */
{mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend,
mdprefetch, mdread, mdwrite, mdwriteback, mdnblocks, mdtruncate,
mdimmedsync, mdpreckpt, mdsync, mdpostckpt
}
};
mdinit的核心流程:创建pendingOpsTable哈希表(注意只有standalone模式、startup进程和checkpointer进程才会创建这张表,普通backend不会创建)
/*
 * Fragment: build the pending-operations hash table and reset the
 * pending-unlink list.
 */
HASHCTL hash_ctl;
/* Dedicated memory context, created as a child of MdCxt, to hold the table */
pendingOpsCxt = AllocSetContextCreate(MdCxt,
"Pending ops context",
ALLOCSET_DEFAULT_SIZES);
/* Permit allocations from this context while inside a critical section */
MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
/* Zero the control struct so only the fields assigned below are set */
MemSet(&hash_ctl, 0, sizeof(hash_ctl));
/* Key is a RelFileNode; each entry is a PendingOperationEntry */
hash_ctl.keysize = sizeof(RelFileNode);
hash_ctl.entrysize = sizeof(PendingOperationEntry);
hash_ctl.hcxt = pendingOpsCxt;
/*
 * Flags per dynahash: HASH_ELEM = keysize/entrysize are supplied,
 * HASH_BLOBS = keys are fixed-size binary values, HASH_CONTEXT = allocate
 * the table in hash_ctl.hcxt.
 */
pendingOpsTable = hash_create("Pending Ops Table",
100L,
&hash_ctl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
/* Start with an empty list of pending unlinks */
pendingUnlinks = NIL;
这里需要牢记的是哈希表的key和entry的类型:
key:RelFileNode
/*
 * Physical identifier of a relation, made up of three OIDs.
 * sizeof(RelFileNode) is used as the key size of the pending-ops hash table.
 */
typedef struct RelFileNode
{
Oid spcNode; /* tablespace */
Oid dbNode; /* database */
Oid relNode; /* relation */
} RelFileNode;
value:PendingOperationEntry
/* Counter type used to stamp pending requests (see cycle_ctr below) */
typedef uint16 CycleCtr; /* can be any convenient integer size */
/*
 * One entry of the pending-operations hash table: the pending fsync
 * requests for one relation, tracked per fork.  Both arrays are indexed by
 * fork number and have MAX_FORKNUM + 1 slots.
 */
typedef struct
{
RelFileNode rnode; /* hash table key (must be first!) */
CycleCtr cycle_ctr; /* mdsync_cycle_ctr of oldest request */
/* requests[f] has bit n set if we need to fsync segment n of fork f */
Bitmapset *requests[MAX_FORKNUM + 1];
/* canceled[f] is true if we canceled fsyncs for fork "recently" */
bool canceled[MAX_FORKNUM + 1];
} PendingOperationEntry;
Bitmapset
/*
 * Bitmapset: a variable-length bitmap made of nwords words, each holding
 * BITS_PER_BITMAPWORD bits.  The word array is a flexible array member, so
 * the struct must be allocated with room for nwords elements.
 */
/* The unit size can be adjusted by changing these three declarations: */
#define BITS_PER_BITMAPWORD 32
typedef uint32 bitmapword; /* must be an unsigned type */
typedef int32 signedbitmapword; /* must be the matching signed type */
typedef struct Bitmapset
{
int nwords; /* number of words in array */
bitmapword words[FLEXIBLE_ARRAY_MEMBER]; /* really [nwords] */
} Bitmapset;
每次checkpoint之后发生的所有页面写操作都要记录下来,下次checkpoint时才能把对应的文件全部fsync到磁盘上。
用哈希表记录所有待处理的fsync请求,方便合并重复请求。
/*
 * RelationOpenSmgr
 * Open the relation at the smgr level, if not already done.
 *
 * When rd_smgr is still NULL, calls smgropen() with the relation's rd_node
 * and rd_backend and passes the result, together with the address of
 * rd_smgr, to smgrsetowner().  Does nothing when rd_smgr is already set.
 * The do { ... } while (0) wrapper lets the macro be used as one statement.
 */
#define RelationOpenSmgr(relation) \
do { \
if ((relation)->rd_smgr == NULL) \
smgrsetowner(&((relation)->rd_smgr), smgropen((relation)->rd_node, (relation)->rd_backend)); \
} while (0)
smgrsetowner比较简单,即在relation中记录open后的结构
即:(relation)->rd_smgr 指向smgropen返回的SMgrRelation,同时该SMgrRelation的smgr_owner记录 &((relation)->rd_smgr)
关键的是smgropen
【小技巧】 1、打点位置:b bufmgr.c:647 调试要第一次打开表文件,否则进不去 2、对文件的所有函数打点:rbreak md.c:.
先看入参:smgropen(RelFileNode rnode, BackendId backend)
实际传入表的物理文件标识(RelFileNode)和所属backend的BackendId(普通表是-1,只有临时表才是具体的backend ID)。
例如下面实例分别对应表空间、数据库、表文件的oid。
(gdb) p rnode
$2 = {spcNode = 1663, dbNode = 13214, relNode = 2684}
/*
 * smgropen
 *		Return the SMgrRelation for (rnode, backend), creating and
 *		initializing the hash entry if none exists yet.
 *
 * A newly created entry has no owner and no open segments, and is linked
 * onto the unowned_relns list.
 */
SMgrRelation
smgropen(RelFileNode rnode, BackendId backend)
{
RelFileNodeBackend brnode;
SMgrRelation reln;
bool found;
/* The lookup table is built lazily: this branch runs only while SMgrRelationHash is still NULL */
if (SMgrRelationHash == NULL)
{
/* First time through: initialize the hash table */
HASHCTL ctl;
MemSet(&ctl, 0, sizeof(ctl));
ctl.keysize = sizeof(RelFileNodeBackend);
ctl.entrysize = sizeof(SMgrRelationData);
SMgrRelationHash = hash_create("smgr relation table", 400,
&ctl, HASH_ELEM | HASH_BLOBS);
dlist_init(&unowned_relns);
}
/* Build the (rnode, backend) key and find the matching SMgrRelation; HASH_ENTER adds it when missing */
/* Look up or create an entry */
brnode.node = rnode;
brnode.backend = backend;
reln = (SMgrRelation) hash_search(SMgrRelationHash,
(void *) &brnode,
HASH_ENTER, &found);
/* A freshly entered element has only its key filled in, so set the remaining fields */
/* Initialize it if not present before */
if (!found)
{
int forknum;
/* hash_search already filled in the lookup key */
reln->smgr_owner = NULL;
/* InvalidBlockNumber in these cached fields means "unknown" */
reln->smgr_targblock = InvalidBlockNumber;
reln->smgr_fsm_nblocks = InvalidBlockNumber;
reln->smgr_vm_nblocks = InvalidBlockNumber;
reln->smgr_which = 0; /* we only have md.c at present */
/* mark it not open */
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
reln->md_num_open_segs[forknum] = 0;
/* it has no owner yet */
dlist_push_tail(&unowned_relns, &reln->node);
}
return reln;
}
找不到就初始化一个SMgrRelation结构,例如第一次访问返回的reln:
{
smgr_rnode =
{
node =
{
spcNode = 1663,
dbNode = 13214,
relNode = 2684},
backend = -1
},
smgr_owner = 0x0,
smgr_targblock = 4294967295,
smgr_fsm_nblocks = 4294967295,
smgr_vm_nblocks = 4294967295,
smgr_which = 0,
md_num_open_segs = {0, 0, 0, 0},
md_seg_fds = {0x0,0x0, 0x0, 0x0},
node = {
prev = 0xf37060 <unowned_relns>, next = 0xf37060 <unowned_relns>
}
}
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,char *buffer)
参数
SMgrRelation reln
{smgr_rnode = {node = {spcNode = 1663, dbNode = 13214, relNode = 2684}, backend = -1},smgr_owner = 0x7f7016220d80, smgr_targblock = 4294967295, smgr_fsm_nblocks = 4294967295,smgr_vm_nblocks = 4294967295, smgr_which = 0, md_num_open_segs = {1, 0, 0, 0}, md_seg_fds = {0x1cef810, 0x0, 0x0, 0x0}, node = {prev = 0xf37060 <unowned_relns>,next = 0xf37060 <unowned_relns>}}
ForkNumber forknum
MAIN_FORKNUM
BlockNumber blocknum
1
核心逻辑三步
第一步:打开MdfdVec
v = _mdfd_getseg(reln, forknum, blocknum, false,EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
第二步:定位文件位置
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
第三步:读取
nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
参数
前三同上
skipFsync
false
behavior
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY
数据结构
代表逻辑上的一张表
/*
 * Per-relation state kept by smgr.  Instances are the entries of the
 * SMgrRelationHash table (its entrysize is sizeof(SMgrRelationData)),
 * keyed by the leading smgr_rnode field.
 */
typedef struct SMgrRelationData
{
/* rnode is the hashtable lookup key, so it must be first! */
RelFileNodeBackend smgr_rnode; /* relation physical identifier */
/* pointer to owning pointer, or NULL if none */
struct SMgrRelationData **smgr_owner;
/*
 * These next three fields are not actually used or manipulated by smgr,
 * except that they are reset to InvalidBlockNumber upon a cache flush
 * event (in particular, upon truncation of the relation). Higher levels
 * store cached state here so that it will be reset when truncation
 * happens. In all three cases, InvalidBlockNumber means "unknown".
 */
BlockNumber smgr_targblock; /* current insertion target block */
BlockNumber smgr_fsm_nblocks; /* last known size of fsm fork */
BlockNumber smgr_vm_nblocks; /* last known size of vm fork */
/* additional public fields may someday exist here */
/*
 * Fields below here are intended to be private to smgr.c and its
 * submodules. Do not touch them from elsewhere.
 */
int smgr_which; /* storage manager selector */
/*
 * for md.c; per-fork arrays of the number of open segments
 * (md_num_open_segs) and the segments themselves (md_seg_fds).
 */
int md_num_open_segs[MAX_FORKNUM + 1];
struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
/* if unowned, list link in list of all unowned SMgrRelations */
dlist_node node;
} SMgrRelationData;
逻辑
第一步:根据块号计算目标段号(segment number)
targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
注意:#define RELSEG_SIZE 131072 一个文件中允许存在的最大页面数量,一个页面8k,131072 x 8 = 1048576 k = 1G
注意:这里是整除(/),得到的是段号,即该块落在第几个表文件上;PG中单个表文件最大1G,超过会切分,但块号是整张表全局编号的,所以这里要做一次映射。
第二步:判断该reln的块是否已经打开
if (targetseg < reln->md_num_open_segs[forknum]) {
v = &reln->md_seg_fds[forknum][targetseg];
如果已经打开直接使用,只有主文件可能有多个targetseg,超1G了会切分。
第三步:如果md_num_open_segs[forknum]为0,说明该fork还没有打开,使用mdopen函数打开。注意mdopen只打开第一个段(segment 0)。
mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
参数:
mdopen (reln=0x1ceb618, forknum=MAIN_FORKNUM, behavior=9) at md.c:623
/*
 * mdopen
 *		Open segment 0 of the given fork of a relation and return its
 *		MdfdVec.
 *
 * If the fork already has open segments, the existing entry for segment 0
 * is returned without opening anything.  On open failure the function
 * returns NULL only when the caller passed EXTENSION_RETURN_NULL in
 * behavior and FILE_POSSIBLY_DELETED(errno) holds; otherwise it reports an
 * ERROR.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
{
MdfdVec *mdfd;
char *path;
File fd;
/* No work if already open */
if (reln->md_num_open_segs[forknum] > 0)
return &reln->md_seg_fds[forknum][0];
/* path is released with pfree() on every path that returns normally */
path = relpath(reln->smgr_rnode, forknum);
fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
if (fd < 0)
{
/*
 * During bootstrap, there are cases where a system relation will be
 * accessed (by internal backend processes) before the bootstrap
 * script nominally creates it. Therefore, accept mdopen() as a
 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
 */
if (IsBootstrapProcessingMode())
fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
/* Still no descriptor: either return NULL quietly or raise an error */
if (fd < 0)
{
if ((behavior & EXTENSION_RETURN_NULL) &&
FILE_POSSIBLY_DELETED(errno))
{
pfree(path);
return NULL;
}
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
}
}
pfree(path);
/* Size the fork's segment array to one element and fill in slot 0 */
_fdvec_resize(reln, forknum, 1);
mdfd = &reln->md_seg_fds[forknum][0];
mdfd->mdfd_vfd = fd;
mdfd->mdfd_segno = 0;
/* Sanity check: one segment never holds more than RELSEG_SIZE blocks */
Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
return mdfd;
}
第四步:从当前已打开的段数md_num_open_segs[forknum]开始(第一次的话经过第三步应该只打开了第0段),到目标段targetseg为止,依次遍历打开中间的所有段。
for (nextsegno = reln->md_num_open_segs[forknum];nextsegno <= targetseg; nextsegno++)
{
(1)计算当前MdfdVec指向的文件,有多少个块
BlockNumber nblocks = _mdnblocks(reln, forknum, v);
(2)按需扩展新文件(表文件切分)
(3)打开文件,维护两个数组结构
v = _mdfd_openseg(reln, forknum, nextsegno, flags);
最重要的两个数组:
typedef struct SMgrRelationData
{
...
int md_num_open_segs[MAX_FORKNUM + 1];
struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
....
} SMgrRelationData;
第一个记录每个fork当前已打开的段(segment)数量;小于1G的表不切分只有一个文件,对应位置存1。数组的四个位置对应四种fork(main、fsm、vm、init)。
第二个记录MdfdVec数组的首地址,四个位置各指向一个MdfdVec数组,数组的每个元素记录一个段文件的fd(mdfd_vfd)和段号(mdfd_segno)。
【简版】
PG中表文件超过1GB会自动拆分,拆分后的fd会变成多个,如何记录呢?
SMgrRelationData结构逻辑上对应一张表,内部负责管理的两个关键数据结构md_num_open_segs、md_seg_fds
简单的说两个数组长度默认都是4,前三个位置分别对应表文件、FSM、VM文件(第四个对应init fork)
md_num_open_segs在对应位置上记录该fork当前已经打开的段数量。
md_seg_fds是指针数组,每个位置指向一个_MdfdVec数组,_MdfdVec存放了切分后的段文件的FD(mdfd_vfd)和段号(mdfd_segno)。