测试SQL
BEGIN;
PREPARE sel(integer) AS
SELECT count(*)
FROM contend
WHERE id BETWEEN $1 AND $1 + 100;
PREPARE upd(integer) AS
UPDATE contend SET val = val + 1
WHERE id IN ($1, $1 + 10, $1 + 20, $1 + 30);
SAVEPOINT a;
\set rnd random(1,990)
EXECUTE sel(10 * :rnd + :client_id + 1);
EXECUTE upd(10 * :rnd + :client_id);
SAVEPOINT a;
\set rnd random(1,990)
EXECUTE sel(10 * :rnd + :client_id + 1);
EXECUTE upd(10 * :rnd + :client_id);
...
SAVEPOINT a;
\set rnd random(1,990)
EXECUTE sel(10 * :rnd + :client_id + 1);
EXECUTE upd(10 * :rnd + :client_id);
DEALLOCATE ALL;
COMMIT;
tps = 167.382164
pgbench -f subtrans.sql -n -c 6 -T 600
transaction type: subtrans.sql
scaling factor: 1
query mode: simple
number of clients: 6
number of threads: 1
duration: 600 s
number of transactions actually processed: 100434
latency average = 35.846 ms
tps = 167.382164 (including connections establishing)
tps = 167.383187 (excluding connections establishing)
perf top
+ 1.86% [.] tbm_iterate
+ 1.77% [.] hash_search_with_hash_value
1.75% [.] AllocSetAlloc
+ 1.36% [.] pg_qsort
+ 1.12% [.] base_yyparse
+ 1.10% [.] TransactionIdIsCurrentTransactionId
+ 0.96% [.] heap_hot_search_buffer
+ 0.96% [.] LWLockAttemptLock
+ 0.85% [.] HeapTupleSatisfiesVisibility
+ 0.82% [.] heap_page_prune
+ 0.81% [.] ExecInterpExpr
+ 0.80% [.] SearchCatCache1
+ 0.79% [.] BitmapHeapNext
+ 0.64% [.] LWLockRelease
+ 0.62% [.] MemoryContextAllocZeroAligned
+ 0.55% [.] _bt_checkkeys
0.54% [.] hash_any
+ 0.52% [.] _bt_compare
0.51% [.] ExecScan
tps = 68.993634
pgbench -f subtrans.sql -n -c 6 -T 600
transaction type: subtrans.sql
scaling factor: 1
query mode: simple
number of clients: 6
number of threads: 1
duration: 600 s
number of transactions actually processed: 41400
latency average = 86.965 ms
tps = 68.993634 (including connections establishing)
tps = 68.993993 (excluding connections establishing)
perf top
+ 10.59% [.] LWLockAttemptLock
+ 7.12% [.] LWLockRelease
+ 2.70% [.] LWLockAcquire
+ 2.40% [.] SimpleLruReadPage_ReadOnly
+ 1.30% [.] TransactionIdIsCurrentTransactionId
+ 1.26% [.] tbm_iterate
+ 1.22% [.] hash_search_with_hash_value
+ 1.08% [.] AllocSetAlloc
+ 0.77% [.] heap_hot_search_buffer
+ 0.72% [.] pg_qsort
+ 0.72% [.] base_yyparse
+ 0.66% [.] SubTransGetParent
+ 0.62% [.] HeapTupleSatisfiesVisibility
+ 0.54% [.] ExecInterpExpr
+ 0.51% [.] SearchCatCache1
《数据库系统实现》7.8.5 多版本时间戳
postgres=# select txid_current_snapshot();
txid_current_snapshot
---------------------------------
1835847:1835853:1835847,1835849
postgres=# select txid_current();
txid_current
--------------
1835853
txid_current_snapshot is xmin:xmax:xip_list
理解这三个值得含义
PG中使用行版本来判断可见性,每行数据都记录了创建、删除的XID;如果记录的XID是子事务的XID,还需要查询子事务的状态确认事务ID是否有效。
默认的RC隔离级别下,每条语句都会重新拿快照:
Snapshot
GetSnapshotData(Snapshot snapshot)
{
...
return snapshot;
}
对于事务可见性判断具体会拿到下面三个数据:
typedef struct SnapshotData
{
...
TransactionId xmin; /* all XID < xmin are visible to me */
TransactionId xmax; /* all XID >= xmax are invisible to me */
TransactionId *xip;
...
CommandId curcid; /* in my xact, CID < curcid are visible */
...
}
快照的获取需要遍历整个process array(在共享内存中记录了所有正在运行的backend信息),每一个session在某一时刻只能有一个确定的XID,但由于子事务的原因,每个session可以额外记录64个活跃的子事务状态信息。
执行SQL
begin; -- 1835915
insert into t1 values (1,1);
insert into t1 values (2,1);
delete from t1 where a = 2;
savepoint a;
insert into t1 values (3,1);
savepoint b;
insert into t1 values (4,1);
savepoint c;
insert into t1 values (5,1);
rollback to savepoint c;
select * from t1;
NOTICE: snapshot->curcid: 6
a | b
---+---
1 | 1
3 | 1
4 | 1
select txid_current();
txid_current
--------------
1835915
内存状态PGPROC
p *MyProc
{
pid = 10219,
pgprocno = 99,
backendId = 3,
databaseId = 13212,
roleId = 10,
subxids = {xids = {1835916, 1835917, 1835918, 1835883, 0 <repeats 60 times>}},
...
}
内存状态PGXACT
p *MyPgXact
{
xid = 1835915,
xmin = 1835915,
overflowed = 0,
nxids = 2
...
}
内存状态CurrentTransactionState
p *CurrentTransactionState
{
transactionId = 0,
subTransactionId = 5,
name = 0x2a9c3d0 "c",
state = TRANS_INPROGRESS,
blockState = TBLOCK_SUBINPROGRESS,
nestingLevel = 4,
parent = 0x2a9b980}
p *CurrentTransactionState->parent
{
transactionId = 1835917,
subTransactionId = 3,
name = 0x2a9c1f8 "b",
state = TRANS_INPROGRESS,
blockState = TBLOCK_SUBINPROGRESS,
nestingLevel = 3,
parent = 0x2a9b818
}
p *CurrentTransactionState->parent->parent
{
transactionId = 1835916,
subTransactionId = 2,
name = 0x2a9be30 "a",
state = TRANS_INPROGRESS,
blockState = TBLOCK_SUBINPROGRESS,
nestingLevel = 2,
parent = 0xf16580 <TopTransactionStateData>
}
p *CurrentTransactionState->parent->parent->parent
{
transactionId = 1835915,
subTransactionId = 1,
name = 0x0,
state = TRANS_INPROGRESS,
blockState = TBLOCK_INPROGRESS,
nestingLevel = 1,
parent = 0x0
}
MVCC判断位置
/* ----------------------------- */
/* 标准查询
/* ----------------------------- */
exec_simple_query
PortalRun
ExecScan
heapgettup_pagemode
heapgetpage
/* ----------------------------- */
/* 标准查询:MVCC判断
/* ----------------------------- */
heapgetpage
// 缓存读,返回pin & unlock状态的buffer
ReadBufferExtended
LockBuffer(buffer, BUFFER_LOCK_SHARE);
// 遍历所有元组!(所以表膨胀对性能会有影响)
for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff); lineoff <= lines; lineoff++, lpp++)
...
// 可见性判断入口函数
valid = HeapTupleSatisfiesVisibility
// 记录可见的元组
if (valid) scan->rs_vistuples[ntup++] = lineoff;
...
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
scan->rs_ntuples = ntup;
HeapTupleSatisfiesMVCC
/* ----------------------------- */
/* MVCC Satisfies
/* ----------------------------- */
HeapTupleSatisfiesVisibility
HeapTupleSatisfiesMVCC
if (!HeapTupleHeaderXminCommitted(tuple)) // 【判断1】创建者没有提交
if (HeapTupleHeaderXminInvalid(tuple)) // 【判断1.1】创建者的事务ID已经回滚了
return false;
...
else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) // 【判断1.2】当前事务产生的元组
... // 当前事务内部使用ctid判断可见性
if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) // 【判断1.2.1】insert发生在scan之后
return false;
if (tuple->t_infomask & HEAP_XMAX_INVALID) // 【判断1.2.2】没有被删除
return true;
if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) // 【判断1.2.3】删除了;删除和读谁比较早?
return true; // 先读 后删除
else
return false; // 先删除 后读
...
else if (XidInMVCCSnapshot...) // (性能问题)【判断1.3】非当前事务产生的元组,事务ID在当前快照中
else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) // 【判断1.4】非当前事务产生的元组;创建者已经提交了
...
else
return false;
...
...
可见性判断过程:
a | b command id xmin xmax t_infomask 可见性判断流程
---+---
1 | 1 0 1835915 0 2048 = 0x800 = HEAP_XMAX_INVALID 可见:1 -> 1.2 -> 1.2.2
(2 | 1) 1 1835915 1835915 32 = 0x20 = HEAP_COMBOCID 不可见:1 -> 1.2 -> 1.2.3
3 | 1 2 1835916 0 2048 = 0x800 = HEAP_XMAX_INVALID 可见:1 -> 1.2 -> 1.2.2
4 | 1 3 1835917 0 2048 = 0x800 = HEAP_XMAX_INVALID 可见:1 -> 1.2 -> 1.2.2
(5 | 1) 4 1835918 0 2560 = 0xA00 = HEAP_XMIN_INVALID 不可见:1 -> 1.1
HEAP_XMAX_INVALID
TransactionIdIsCurrentTransactionId如何判断数据是当前事务产生的?
遍历CurrentTransactionState list,如果找到创建元组的xmin说明是当前事务产生的
性能问题的根源:
HeapTupleSatisfiesMVCC拿到一个元组的xmin时,并不知道该XID是子事务ID还是事务ID,例如上面数据(5|1)是事务1835918创建的:
(1)如果快照中保存了完整的事务信息(子事务少于64个),可以通过快照保存的子事务信息。
(2)如果快照的子事务信息溢出了,那么需要去遍历SLRU页面,通过自己的XID一层一层的找到顶层事务ID用户判断可见性。
1835918->1835917->1835916->1835915
XidInMVCCSnapshot
...
if (!snapshot->suboverflowed)
{
/* we have full data, so search subxip */
int32 j;
for (j = 0; j < snapshot->subxcnt; j++)
{
if (TransactionIdEquals(xid, snapshot->subxip[j]))
return true;
}
/* not there, fall through to search xip[] */
}
else
{
// SubTransGetTopmostTransaction --> SubTransGetParent
// --> SimpleLruReadPage_ReadOnly --> (control LW_EXCLUSIVE) --> SimpleLruReadPage
// --> SlruSelectLRUPage --> (bufer_locks LW_EXCLUSIVE) SimpleLruWaitIO --> ...
xid = SubTransGetTopmostTransaction(xid);
if (TransactionIdPrecedes(xid, snapshot->xmin))
return false;
}
...
Min(128, Max(4, NBuffers / 512))
:4-128个