写操作如何获取buffer与页面扩展
PG的insert和update操作都会用到RelationGetBufferForTuple函数,例如insert:
heap_insert
// [1] sets the tuple header fields, assigns an OID, and toasts the tuple if necessary
heap_prepare_insert
// [2] Returns pinned and exclusive-locked buffer of a page in given relation with free space >= given len
RelationGetBufferForTuple
// [3] place tuple at specified page
RelationPutHeapTuple
// [4] xlog stuff
// [5] mark it for invalidation from the caches
CacheInvalidateHeapTuple
RelationGetBufferForTuple
单insert常规流程,不考虑bulk场景和heap_update调入的场景,双页场景在heap_update系列再分析。
// 从SMGR拿当前使用的页面
targetBlock = RelationGetTargetBlock(relation) : (relation)->rd_smgr->smgr_targblock
ReadBufferBI
// 循环直到拿到一个页面
while (targetBlock != InvalidBlockNumber)
// 写锁
LockBuffer(BUFFER_LOCK_EXCLUSIVE)
// 拿页面指针
BufferGetPage
// 拿空闲空间
PageGetHeapFreeSpace
// 空闲空间够用
if (len + saveFreeSpace <= pageFreeSpace)
RelationSetTargetBlock(relation, targetBlock);
return buffer;
// 从SMGR拿当前使用的页面,肯定能拿到,空间够不够不关注
targetBlock = RelationGetTargetBlock(relation) : (relation)->rd_smgr->smgr_targblock
ReadBufferBI
while (targetBlock != InvalidBlockNumber)
// 不管空间够不够先锁
LockBuffer(BUFFER_LOCK_EXCLUSIVE)
// 拿页面指针
BufferGetPage
// 拿空闲空间
PageGetHeapFreeSpace
// 空闲空间 不够!
if (80 + 0 > 16)
if (len + saveFreeSpace <= pageFreeSpace)
// RelationSetTargetBlock(relation, targetBlock);
// return buffer;
// 先放锁、unpin
LockBuffer(BUFFER_LOCK_UNLOCK)
ReleaseBuffer(buffer)
// 更新fsm之后再查询一次,没找到返回InvalidBlockNumber
targetBlock = RecordAndGetPageWithFreeSpace(relation, targetBlock, pageFreeSpace, len + saveFreeSpace)
// 【【表写锁】】
!ConditionalLockRelationForExtension(relation, ExclusiveLock)
// 用P_NEW读buffer
ReadBufferBI(relation, P_NEW, bistate)
// 【写锁】
LockBuffer(BUFFER_LOCK_EXCLUSIVE)
// 【【放表写锁】】
UnlockRelationForExtension(relation, ExclusiveLock)
BufferGetPage
PageInit
RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer))
return buffer
// 从SMGR拿当前使用的页面,肯定能拿到,空间够不够不关注
targetBlock = RelationGetTargetBlock(relation) : (relation)->rd_smgr->smgr_targblock
ReadBufferBI
while (targetBlock != InvalidBlockNumber)
// 不管空间够不够先锁
LockBuffer(BUFFER_LOCK_EXCLUSIVE)
// 拿页面指针
BufferGetPage
// 拿空闲空间
PageGetHeapFreeSpace
// 空闲空间 不够!
if (80 + 0 > 16)
if (len + saveFreeSpace <= pageFreeSpace)
// RelationSetTargetBlock(relation, targetBlock);
// return buffer;
// 先放锁、unpin
LockBuffer(BUFFER_LOCK_UNLOCK)
ReleaseBuffer(buffer)
// 更新fsm之后再查询一次,没找到返回InvalidBlockNumber
targetBlock = RecordAndGetPageWithFreeSpace(relation, targetBlock, pageFreeSpace, len + saveFreeSpace)
// 【【表写锁】】try失败
if (!ConditionalLockRelationForExtension(relation, ExclusiveLock))
// 等别人用完
LockRelationForExtension
// 如果别人扩展了,就不用再扩展了
targetBlock = GetPageWithFreeSpace
if (targetBlock != InvalidBlockNumber)
goto loop
// *** 批量扩展
RelationAddExtraBlocks
// 用P_NEW读buffer
ReadBufferBI(relation, P_NEW, bistate)
// 【写锁】
LockBuffer(BUFFER_LOCK_EXCLUSIVE)
// 【【放表写锁】】
UnlockRelationForExtension(relation, ExclusiveLock)
BufferGetPage
PageInit
RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer))
return buffer
// Adds several blocks to the relation in one call and records each new page in
// the free space map. The number of blocks is derived from how many backends
// are waiting on the relation extension lock.
// NOTE(review): local declarations are omitted in this excerpt; firstBlock is
// presumably initialized to InvalidBlockNumber — confirm against the full source.
static void
RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
{
// How many backends are waiting on the extension lock?
lockWaiters = RelationExtensionLockWaiterCount(relation);
// Nobody is waiting: return without adding any block.
if (lockWaiters <= 0)
return;
// 20 blocks per waiter, capped at 512 (Min yields an upper bound: at most 512, not at least).
extraBlocks = Min(512, lockWaiters * 20);
// NOTE(review): post-decrement combined with ">= 0" makes the body run extraBlocks + 1 times.
while (extraBlocks-- >= 0)
{
// Each iteration extends the relation by one page (P_NEW).
buffer = ReadBufferBI(relation, P_NEW, bistate);
// Initialize the new page under an exclusive buffer lock and mark the buffer dirty.
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buffer);
PageInit(page, BufferGetPageSize(buffer), 0);
MarkBufferDirty(buffer);
// Capture block number and free space before the buffer is unlocked and released.
blockNum = BufferGetBlockNumber(buffer);
freespace = PageGetHeapFreeSpace(page);
UnlockReleaseBuffer(buffer);
// Remember the first block added by this call.
if (firstBlock == InvalidBlockNumber)
firstBlock = blockNum;
// Record this page's free space in the FSM right away.
RecordPageWithFreeSpace(relation, blockNum, freespace);
}
// One more FSM update after the loop, covering firstBlock..blockNum with the
// free space of the last page added (presumably propagating to the upper FSM
// levels — confirm against UpdateFreeSpaceMap).
UpdateFreeSpaceMap(relation, firstBlock, blockNum, freespace);
}
ReadBuffer_common(...,P_NEW,...)
整体流程总结:
1、使用smgrnblocks当做新的页面ID。
2、用新页ID查hash查不到,clocksweep新分配一个。found=false
3、memset清空内存页面,smgrextend定位(seek)到文件中新页面的位置,写入清空的页面。
4、设置BM_VALID标志后返回。
// Abridged excerpt of ReadBuffer_common, focused on the P_NEW (relation
// extension) path: pick the new block number, allocate a buffer for it, zero
// the block, extend the file, then finish the buffer I/O. "..." marks elided code.
static Buffer
ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BlockNumber blockNum, ReadBufferMode mode,
BufferAccessStrategy strategy, bool *hit)
{
...
// P_NEW means the caller asks for a brand-new block at the end of the relation.
isExtend = (blockNum == P_NEW);
...
if (isExtend)
// Use the current block count as the new block number: e.g. if blocks 0 and 1
// are in use, the relation has two blocks and smgrnblocks returns 2.
blockNum = smgrnblocks(smgr, forkNum);
...
// 1. The block number obtained above is new, so the hash-table lookup on its tag
//    cannot hit; a buffer is taken via clock sweep.
// 2. The descriptor is initialized: buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE
// 3. found is returned as false.
bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum, strategy, &found);
if (found)
{
...
}
// Get the pointer to the buffer's block (8 kB with the default BLCKSZ).
bufBlock = BufHdrGetBlock(bufHdr);
if (isExtend)
{
// Zero the whole block.
MemSet((char *) bufBlock, 0, BLCKSZ);
// Extend the file with the zeroed block.
smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
// The lines down to FileWrite are a trace of mdextend (reached via smgrextend),
// inlined here for illustration; they are not literal statements of this function.
mdextend
// Locate the segment file that holds this block (presumably 1 GB segments — confirm RELSEG_SIZE).
v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
// Byte offset inside the segment; e.g. blocknum = 2 (the third page) with
// BLCKSZ = 8192 gives seekpos = 16384.
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
// Seek to that offset (a seek, not a read); a result different from seekpos means failure.
FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos;
// Write the block at that offset.
FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND);
}
else
{
...
}
// Finish the buffer I/O, passing BM_VALID as the flag to set.
TerminateBufferIO(bufHdr, false, BM_VALID);
...
return BufferDescriptorGetBuffer(bufHdr);
}
第一步拿到空8k,这里先初始化页面头。
// Initializes a page: zeroes the whole page, then fills in the header fields.
//   page        - the page to initialize
//   pageSize    - size of the page in bytes; asserted to equal BLCKSZ
//   specialSize - bytes reserved at the end of the page; aligned with MAXALIGN
// NOTE(review): the return type is not shown in this excerpt.
PageInit(Page page, Size pageSize, Size specialSize)
{
PageHeader p = (PageHeader) page;
// Align the special-space size before using it in the offsets below.
specialSize = MAXALIGN(specialSize);
Assert(pageSize == BLCKSZ);
// The page must be larger than the header plus the special space.
Assert(pageSize > specialSize + SizeOfPageHeaderData);
/* Make sure all fields of page are zero, as well as unused space */
MemSet(p, 0, pageSize);
p->pd_flags = 0;
// pd_lower is set to the header size; pd_upper and pd_special are both set to
// pageSize - specialSize, so the span from pd_lower to pd_upper covers
// everything between the header and the special space.
p->pd_lower = SizeOfPageHeaderData;
p->pd_upper = pageSize - specialSize;
p->pd_special = pageSize - specialSize;
// Record the page size and layout version in the header.
PageSetPageSizeAndVersion(page, pageSize, PG_PAGE_LAYOUT_VERSION);
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
GetPageWithFreeSpace
// Fragment (from the caller side): choosing a target block when none is known yet.
// Only runs when there is no target block and use_fsm is set.
if (targetBlock == InvalidBlockNumber && use_fsm)
{
// Ask the FSM for a page with room for len + saveFreeSpace bytes.
targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
// The FSM returned nothing: fall back to the last existing block of the
// relation, if the relation has any blocks at all.
if (targetBlock == InvalidBlockNumber)
{
BlockNumber nblocks = RelationGetNumberOfBlocks(relation);
if (nblocks > 0)
targetBlock = nblocks - 1;
}
}
GetPageWithFreeSpace函数是fsm提供的函数用来找到一个空闲页面,至少能容纳len的内容。
在FSM页面内,搜索从 fsmpage->fp_next_slot 开始爬,这个值是上一次查询找到合适页面时配置的。
从 fp_next_slot 指向的节点出发,当前节点不满足就开始move right,然后move up。
请求的大小N会先换算成 (31+N)/32,每一个数据页面对应一个FSM字节。所以进入fsm的第一步就是做一个转换,把size/32后进入后续判断。
这是一个爬树的实例:
       7
   7       6
 5   7   6   5
4 5 5 7 2 6 5 2
            T
Assume that the target node is the node indicated by the letter T,
and we're searching for a node with value of 6 or higher. The search
begins at T. At the first iteration, we move to the right, then to the
parent, arriving at the rightmost 5. At the second iteration, we move
to the right, wrapping around, then climb up, arriving at the 7 on the
third level. 7 satisfies our search, so we descend down to the bottom,
following the path of sevens. This is in fact the first suitable page
to the right of (allowing for wraparound) our start point.