从一个实际CASE出发
o1=# \d t1
Table "public.t1"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
i | integer | | |
a | integer | | |
name | text | | |
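o1=# insert into t1 values (5, 300, 'gaomingjie');
执行INSERT时在gdb中停在heap_form_tuple,调用栈如下(注:堆栈里sourceText显示的值是 (4, 200, 'gaomingjie'),与上面的示例值不同,但调用路径一致):
(gdb) bt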
#0 heap_form_tuple (tupleDescriptor=0x10485e8, values=0x1048800, isnull=0x1048838 "") at heaptuple.c:701
#1 0x00000000006c703b in ExecCopySlotTuple (slot=0x10484f0) at execTuples.c:564
#2 0x00000000006c72fe in ExecMaterializeSlot (slot=0x10484f0) at execTuples.c:750
#3 0x00000000006e730a in ExecInsert (mtstate=0x1048088, slot=0x10484f0, planSlot=0x10484f0, arbiterIndexes=0x0,
onconflict=ONCONFLICT_NONE, estate=0x1047d20, canSetTag=1 '\001') at nodeModifyTable.c:275
#4 0x00000000006e95bb in ExecModifyTable (pstate=0x1048088) at nodeModifyTable.c:1779
#5 0x00000000006c2ec1 in ExecProcNodeFirst (node=0x1048088) at execProcnode.c:430
#6 0x00000000006bb604 in ExecProcNode (node=0x1048088) at ../../../src/include/executor/executor.h:250
#7 0x00000000006bddd2 in ExecutePlan (estate=0x1047d20, planstate=0x1048088, use_parallel_mode=0 '\000', operation=CMD_INSERT,
sendTuples=0 '\000', numberTuples=0, direction=ForwardScanDirection, dest=0x104f260, execute_once=1 '\001') at execMain.c:1723
#8 0x00000000006bbba6 in standard_ExecutorRun (queryDesc=0xfb8b60, direction=ForwardScanDirection, count=0, execute_once=1 '\001')
at execMain.c:364
#9 0x00000000006bb9d2 in ExecutorRun (queryDesc=0xfb8b60, direction=ForwardScanDirection, count=0, execute_once=1 '\001')
at execMain.c:307
#10 0x000000000087f325 in ProcessQuery (plan=0x104f168, sourceText=0x100aef0 "insert into t1 values (4, 200, 'gaomingjie');", params=0x0,
queryEnv=0x0, dest=0x104f260, completionTag=0x7ffd48dd7040 "") at pquery.c:161
#11 0x0000000000880c1c in PortalRunMulti (portal=0x106dd90, isTopLevel=1 '\001', setHoldSnapshot=0 '\000', dest=0x104f260,
altdest=0x104f260, completionTag=0x7ffd48dd7040 "") at pquery.c:1286
#12 0x0000000000880205 in PortalRun (portal=0x106dd90, count=9223372036854775807, isTopLevel=1 '\001', run_once=1 '\001', dest=0x104f260,
altdest=0x104f260, completionTag=0x7ffd48dd7040 "") at pquery.c:799
#13 0x000000000087a2e3 in exec_simple_query (query_string=0x100aef0 "insert into t1 values (4, 200, 'gaomingjie');") at postgres.c:1122
#14 0x000000000087e4e7 in PostgresMain (argc=1, argv=0xfbb700, dbname=0xfbb5a0 "o1", username=0xfbb578 "mingjiegao") at postgres.c:4142
#15 0x00000000007e0164 in BackendRun (port=0xfb1280) at postmaster.c:4453
#16 0x00000000007df8ec in BackendStartup (port=0xfb1280) at postmaster.c:4117
#17 0x00000000007dbfc3 in ServerLoop () at postmaster.c:1777
#18 0x00000000007db602 in PostmasterMain (argc=3, argv=0xf8dc70) at postmaster.c:1385
#19 0x00000000007195f6 in main (argc=3, argv=0xf8dc70) at main.c:228
执行器会把tuple包装成tuple table slot来处理,相当于给HeapTuple包装了一层:TupleTableSlot。
代码里会见到很多tts前缀,tts = Tuple Table Slot
typedef struct TupleTableSlot
{
NodeTag type;
bool tts_isempty; /* true = slot is empty */
bool tts_shouldFree; /* should pfree tts_tuple? */
bool tts_shouldFreeMin; /* should pfree tts_mintuple? */
bool tts_slow; /* saved state for slot_deform_tuple */
HeapTuple tts_tuple; /* physical tuple, or NULL if virtual */
TupleDesc tts_tupleDescriptor; /* slot's tuple descriptor */
MemoryContext tts_mcxt; /* slot itself is in this context */
Buffer tts_buffer; /* tuple's buffer, or InvalidBuffer */
int tts_nvalid; /* # of valid values in tts_values */
Datum *tts_values; /* current per-attribute values */
bool *tts_isnull; /* current per-attribute isnull flags */
MinimalTuple tts_mintuple; /* minimal tuple, or NULL if none */
HeapTupleData tts_minhdr; /* workspace for minimal-tuple-only case */
long tts_off; /* saved state for slot_deform_tuple */
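} TupleTableSlot;
对于普通tuple来说,ExecInsert的第一件事就是“物化”Tuple Slot:
【slot】 ----组装----> 【materialized slot】
下面按调用顺序(ExecInsert -> ExecMaterializeSlot -> ExecCopySlotTuple -> heap_form_tuple -> heap_fill_tuple)分析,重点在第4步heap_form_tuple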
static TupleTableSlot *
ExecInsert(ModifyTableState *mtstate,
TupleTableSlot *slot,
TupleTableSlot *planSlot,
List *arbiterIndexes,
OnConflictAction onconflict,
EState *estate,
bool canSetTag)
{
HeapTuple tuple;
ResultRelInfo *resultRelInfo;
Relation resultRelationDesc;
Oid newId;
List *recheckIndexes = NIL;
TupleTableSlot *result = NULL;
/*
* get the heap tuple out of the tuple table slot, making sure we have a
* writable copy
*/
tuple = ExecMaterializeSlot(slot);
...
...
ExecMaterializeSlot:Force a slot into the “materialized” state.
HeapTuple
ExecMaterializeSlot(TupleTableSlot *slot)
{
MemoryContext oldContext;
/*
* sanity checks
*/
Assert(slot != NULL);
Assert(!slot->tts_isempty);
/*
* If we have a regular physical tuple, and it's locally palloc'd, we have
* nothing to do.
*/
if (slot->tts_tuple && slot->tts_shouldFree)
return slot->tts_tuple;
/*
* Otherwise, copy or build a physical tuple, and store it into the slot.
*
* We may be called in a context that is shorter-lived than the tuple
* slot, but we have to ensure that the materialized tuple will survive
* anyway.
*/
oldContext = MemoryContextSwitchTo(slot->tts_mcxt);
slot->tts_tuple = ExecCopySlotTuple(slot);
slot->tts_shouldFree = true;
MemoryContextSwitchTo(oldContext);
...
...
ExecCopySlotTuple:拼装入口
HeapTuple
ExecCopySlotTuple(TupleTableSlot *slot)
{
...
return heap_form_tuple(slot->tts_tupleDescriptor,
slot->tts_values,
slot->tts_isnull);
}
heap_form_tuple:根据values和isnull拼装tuple
/*
* heap_form_tuple
* construct a tuple from the given values[] and isnull[] arrays,
* which are of the length indicated by tupleDescriptor->natts
*
* The result is allocated in the current memory context.
*/
HeapTuple
heap_form_tuple(TupleDesc tupleDescriptor,
Datum *values,
bool *isnull)
{
HeapTuple tuple; /* return tuple */
HeapTupleHeader td; /* tuple data */
Size len,
data_len;
int hoff;
bool hasnull = false;
int numberOfAttributes = tupleDescriptor->natts;
int i;
if (numberOfAttributes > MaxTupleAttributeNumber)
ereport(ERROR,
(errcode(ERRCODE_TOO_MANY_COLUMNS),
errmsg("number of columns (%d) exceeds limit (%d)",
numberOfAttributes, MaxTupleAttributeNumber)));
/*
* Check for nulls
*/
for (i = 0; i < numberOfAttributes; i++)
{
if (isnull[i])
{
hasnull = true;
break;
}
}
/*
* Determine total space needed
*/
len = offsetof(HeapTupleHeaderData, t_bits);
这里的len=23:头部首先要存HeapTupleHeaderData的固定部分,即t_bits(null bitmap)之前的所有字段,共23个字节:
struct HeapTupleHeaderData
{
union
{
HeapTupleFields t_heap;
DatumTupleFields t_datum;
} t_choice;
ItemPointerData t_ctid; /* current TID of this or newer tuple (or a
* speculative insertion token) */
/* Fields below here must match MinimalTupleData! */
uint16 t_infomask2; /* number of attributes + various flags */
uint16 t_infomask; /* various flag bits, see below */
uint8 t_hoff; /* sizeof header incl. bitmap, padding */
/* ^ - 23 bytes - ^ */
bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; /* bitmap of NULLs */
/* MORE DATA FOLLOWS AT END OF STRUCT */
};
注意t_bits是柔性数组成员(FLEXIBLE_ARRAY_MEMBER,即空数组技巧),它的地址就是前面23字节固定部分之后的起始内存位置,方便在结构体后面紧接着存放null bitmap和数据。
if (hasnull)
len += BITMAPLEN(numberOfAttributes);
if (tupleDescriptor->tdhasoid)
len += sizeof(Oid);
如果有null列,需要用bitmap记录每一列是否为null:每列占1个bit(bit为1表示该列非null),按整字节分配,即BITMAPLEN(natts) = (natts + 7) / 8 个字节。
hoff = len = MAXALIGN(len); /* align user data safely */
这里比较有意思:如果没有null列、也没有oid列,len会从23对齐到24字节(MAXALIGN按8字节对齐时),头部末尾会多出1个字节的填充。
bitmap的长度由列数决定(而不是由null的个数决定),所以当表的列数不超过8时,即使有null列,1个字节的bitmap也正好落在这个填充字节里,头部仍然是24字节,相当于把这块空间利用起来了,等于免费赠送 : )
data_len = heap_compute_data_size(tupleDescriptor, values, isnull);
计算各字段数据的总长度。这里顺便说下text类型的数据在gdb里要怎么debug:
values数组的每个元素对应一个字段,text字段的值要强转成varattrib_4b再打印:
p *(varattrib_4b *)values[2]

len += data_len;
/*
* Allocate and zero the space needed. Note that the tuple body and
* HeapTupleData management structure are allocated in one chunk.
*/
tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + len);
tuple->t_data = td = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
最前面再增加一个HeapTupleData管理结构(大小为HEAPTUPLESIZE),它和tuple本体在同一块内存里一次性分配:
|-------------------------------| <------ tuple
| HeapTuple |
|-------------------------------| <------ tuple->t_data header->t_hoff = len(head)
| HeapTupleHeader |
|-------------------------------|
| |
| |
| DATA |
| |
| |
|-------------------------------|
/*
* And fill in the information. Note we fill the Datum fields even though
* this tuple may never become a Datum. This lets HeapTupleHeaderGetDatum
* identify the tuple type if needed.
*/
tuple->t_len = len;
ItemPointerSetInvalid(&(tuple->t_self));
tuple->t_tableOid = InvalidOid;
HeapTupleHeaderSetDatumLength(td, len);
HeapTupleHeaderSetTypeId(td, tupleDescriptor->tdtypeid);
HeapTupleHeaderSetTypMod(td, tupleDescriptor->tdtypmod);
/* We also make sure that t_ctid is invalid unless explicitly set */
ItemPointerSetInvalid(&(td->t_ctid));
HeapTupleHeaderSetNatts(td, numberOfAttributes);
td->t_hoff = hoff;
if (tupleDescriptor->tdhasoid) /* else leave infomask = 0 */
td->t_infomask = HEAP_HASOID;
内存都申请好了,这里开始用values组装tuple:
heap_fill_tuple(tupleDescriptor,
values,
isnull,
(char *) td + hoff,
data_len,
&td->t_infomask,
(hasnull ? td->t_bits : NULL));
return tuple;
}
heap_fill_tuple:根据desc把数据填到正确的位置
void
heap_fill_tuple(TupleDesc tupleDesc,
Datum *values, bool *isnull,
char *data, Size data_size,
uint16 *infomask, bits8 *bit)
{
bits8 *bitP;
int bitmask;
int i;
int numberOfAttributes = tupleDesc->natts;
Form_pg_attribute *att = tupleDesc->attrs;
#ifdef USE_ASSERT_CHECKING
char *start = data;
#endif
if (bit != NULL)
{
bitP = &bit[-1];
bitmask = HIGHBIT;
}
else
{
/* just to keep compiler quiet */
bitP = NULL;
bitmask = 0;
}
*infomask &= ~(HEAP_HASNULL | HEAP_HASVARWIDTH | HEAP_HASEXTERNAL);
for (i = 0; i < numberOfAttributes; i++)
{
Size data_length;
if (bit != NULL)
{
if (bitmask != HIGHBIT)
bitmask <<= 1;
else
{
bitP += 1;
*bitP = 0x0;
bitmask = 1;
}
if (isnull[i])
{
*infomask |= HEAP_HASNULL;
continue;
}
*bitP |= bitmask;
}
/*
* XXX we use the att_align macros on the pointer value itself, not on
* an offset. This is a bit of a hack.
*/
整型数据会走这个分支:
数据本身已经在values中了,store_att_byval内部主要是按attlen把Datum转成对应宽度(即长度截断)后写入data
if (att[i]->attbyval)
{
/* pass-by-value */
data = (char *) att_align_nominal(data, att[i]->attalign);
store_att_byval(data, values[i], att[i]->attlen);
data_length = att[i]->attlen;
}
text类型数据会走这里,内部数据会memcpy到data中:
else if (att[i]->attlen == -1)
{
/* varlena */
Pointer val = DatumGetPointer(values[i]);
*infomask |= HEAP_HASVARWIDTH;
if (VARATT_IS_EXTERNAL(val))
{
if (VARATT_IS_EXTERNAL_EXPANDED(val))
{
/*
* we want to flatten the expanded value so that the
* constructed tuple doesn't depend on it
*/
ExpandedObjectHeader *eoh = DatumGetEOHP(values[i]);
data = (char *) att_align_nominal(data,
att[i]->attalign);
data_length = EOH_get_flat_size(eoh);
EOH_flatten_into(eoh, data, data_length);
}
else
{
*infomask |= HEAP_HASEXTERNAL;
/* no alignment, since it's short by definition */
data_length = VARSIZE_EXTERNAL(val);
memcpy(data, val, data_length);
}
}
else if (VARATT_IS_SHORT(val))
{
/* no alignment for short varlenas */
data_length = VARSIZE_SHORT(val);
memcpy(data, val, data_length);
}
else if (VARLENA_ATT_IS_PACKABLE(att[i]) &&
VARATT_CAN_MAKE_SHORT(val))
{
/* convert to short varlena -- no alignment */
data_length = VARATT_CONVERTED_SHORT_SIZE(val);
SET_VARSIZE_SHORT(data, data_length);
memcpy(data + 1, VARDATA(val), data_length - 1);
}
else
{
/* full 4-byte header varlena */
data = (char *) att_align_nominal(data,
att[i]->attalign);
data_length = VARSIZE(val);
memcpy(data, val, data_length);
}
}
else if (att[i]->attlen == -2)
{
/* cstring ... never needs alignment */
*infomask |= HEAP_HASVARWIDTH;
Assert(att[i]->attalign == 'c');
data_length = strlen(DatumGetCString(values[i])) + 1;
memcpy(data, DatumGetPointer(values[i]), data_length);
}
else
{
/* fixed-length pass-by-reference */
data = (char *) att_align_nominal(data, att[i]->attalign);
Assert(att[i]->attlen > 0);
data_length = att[i]->attlen;
memcpy(data, DatumGetPointer(values[i]), data_length);
}
data += data_length;
}
Assert((data - start) == data_size);
}