前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >Postgre物理Page分析

Postgre物理Page分析

作者头像
用户4700054
发布2022-08-17 12:31:37
6600
发布2022-08-17 12:31:37
举报
文章被收录于专栏:存储内核技术交流

pageinspect分析工具

  • 编译安装postgres extension
代码语言:javascript
复制
// 安装 postgres extension
[root@centos-linux ~]$ mkdir /usr/local/pgsql &&  chown -R perrynzhou:perrynzhou /usr/local/pgsql && chmod -R 755 /usr/local/pgsql

[perrynzhou@centos-linux ~]$cd /home/perrynzhou/postgresql-14rc1 && env CFLAGS="-g -O0" ./configure
[perrynzhou@centos-linux ~]$c cd /home/perrynzhou/postgresql-14rc1/contrib &&  make && make install


// 安装完毕后的extension
[perrynzhou@centos-linux ~]$ ls -l /usr/local/pgsql/bin/
total 192
-rwxr-xr-x. 1 perrynzhou perrynzhou 95896 Nov 17 00:06 oid2name
-rwxr-xr-x. 1 perrynzhou perrynzhou 94632 Nov 17 00:06 vacuumlo
[perrynzhou@centos-linux ~]$ ls -l /usr/local/pgsql/lib/
total 4952
-rwxr-xr-x. 1 perrynzhou perrynzhou  70448 Nov 17 00:06 adminpack.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 173784 Nov 17 00:06 amcheck.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  24072 Nov 17 00:06 auth_delay.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  63152 Nov 17 00:06 auto_explain.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36720 Nov 17 00:06 autoinc.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 195416 Nov 17 00:06 bloom.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  79536 Nov 17 00:06 btree_gin.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 619560 Nov 17 00:06 btree_gist.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36792 Nov 17 00:06 citext.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 120040 Nov 17 00:06 cube.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 134000 Nov 17 00:06 dblink.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  28240 Nov 17 00:06 dict_int.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  31208 Nov 17 00:06 dict_xsyn.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  12208 Nov 17 00:06 earthdistance.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 100744 Nov 17 00:06 file_fdw.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  61136 Nov 17 00:06 fuzzystrmatch.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 229144 Nov 17 00:06 hstore.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36376 Nov 17 00:06 insert_username.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 170080 Nov 17 00:06 _int.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 130848 Nov 17 00:06 isn.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36168 Nov 17 00:06 lo.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 192416 Nov 17 00:06 ltree.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36416 Nov 17 00:06 moddatetime.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  46952 Nov 17 00:06 old_snapshot.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 269768 Nov 17 00:06 pageinspect.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  16008 Nov 17 00:06 passwordcheck.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  47944 Nov 17 00:06 pg_buffercache.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 453664 Nov 17 00:06 pgcrypto.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  22744 Nov 17 00:06 pg_freespacemap.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  78784 Nov 17 00:06 pg_prewarm.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  70200 Nov 17 00:06 pgrowlocks.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 116472 Nov 17 00:06 pg_stat_statements.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 153928 Nov 17 00:06 pgstattuple.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  50928 Nov 17 00:06 pg_surgery.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 142808 Nov 17 00:06 pg_trgm.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  73536 Nov 17 00:06 pg_visibility.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 376072 Nov 17 00:06 postgres_fdw.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  49984 Nov 17 00:06 refint.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  82760 Nov 17 00:06 seg.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  74720 Nov 17 00:06 tablefunc.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  38896 Nov 17 00:06 tcn.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  68280 Nov 17 00:06 test_decoding.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  55976 Nov 17 00:06 tsm_system_rows.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  56520 Nov 17 00:06 tsm_system_time.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  40912 Nov 17 00:06 unaccent.so
  • 启动数据库并加载
代码语言:javascript
复制
// 启动需要取消auto vacuum
[perrynzhou@centos-linux ~]$ pg_ctl -D /postgres/data -l logfile stop
[perrynzhou@centos-linux ~]$ pg_ctl -D /postgres/data -l logfile start
[perrynzhou@centos-linux ~]$ psql -h 127.0.0.1 -d sampledb

// 加载extension
sampledb=# create extension pageinspect;
CREATE EXTENSION

// 创建测试表
sampledb=# create table happy(id int,name varchar);
CREATE TABLE

物理也分析

  • 表的隐藏列的含义和分析,PG一个表包含了tableoid/cmax/xmax/cmin/xmin/ctid这几个隐藏列,其中tableoid是表文件的唯一标识,xmin代表当向表中插入一行数据的的事务ID;xmax字段如果值为0,标识这一行的数据没有被删除,如果表中的这行数据被删除,xmax的值就是执行这行数据删除的事务ID;cmin是插入事务内命令行标识;cmax是删除事务内命令行标识;这些标识用于MVCC中判断row是否对于其他事务可见
代码语言:javascript
复制
sampledb=# \d happy
                    Table "public.happy"
 Column |       Type        | Collation | Nullable | Default 
--------+-------------------+-----------+----------+---------
 id     | integer           |           |          | 
 name   | character varying |           |          | 

sampledb=# begin;
BEGIN

// 查看表的隐藏列
sampledb=# select  attname, format_type (atttypid, atttypmod) from  pg_attribute as a,pg_class as b  where a.attrelid = b.oid and b.relname='happy';
 attname  |    format_type    
----------+-------------------
 tableoid | oid
 cmax     | cid
 xmax     | xid
 cmin     | cid
 xmin     | xid
 ctid     | tid
 id       | integer
 name     | character varying
 
 // 开始插入一条数据,xmin是执行插入语句事务的ID,xmax是为0
 
sampledb=*# insert into happy  select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
sampledb=*# select xmin,xmax,cmin,cmax,* from  happy;
 xmin | xmax | cmin | cmax |   id   |               name               
------+------+------+------+--------+----------------------------------
 5823 |    0 |    0 |    0 | 739849 | 26297c7c1d030e111601a1ce070eae5f
(1 row)

// 删除这条数据,然后回滚观察xmax的值

sampledb=*# delete from happy;
DELETE 1
sampledb=*# rollback;
ROLLBACK

sampledb=# select xmin,xmax,cmin,cmax,* from  happy;
 xmin | xmax | cmin | cmax |  id   |               name               
------+------+------+------+-------+----------------------------------
 5828 | 5829 |    0 |    0 | 52794 | 616bccad918d0b2de733f544bc4a6eb7
  • 表中插入数据,通过pageinspect分析可以看出t_xmin和t_xmax,t_xmin对应的是隐藏字段xmin,t_xmax对应是隐藏字段xmax。insert数据后,t_xmin为当前插入事务的ID。该记录没有任何的更新t_xmax也是为0
代码语言:javascript
复制
// 开启一个事务
sampledb=# begin;
BEGIN

// 插入三条记录
sampledb=*# insert into happy  select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
sampledb=*# insert into happy  select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
sampledb=*# insert into happy  select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1

// 通过工具分析该表的物理page,可以看出可以把数据打印出来
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
--------+--------+-----------------------------------------------------------------------------------------
   5831 |      0 | {"\\x26f80300","\\x436565633366636139643261356639386266646434343833623235386334306266"}
   5831 |      0 | {"\\x865d0100","\\x433262663962343761376665643665343363333033373837313730333263396261"}
   5831 |      0 | {"\\x6dc10300","\\x433963306533616439363931316362326261386332333031663530333339313037"}
(3 rows)
  • 表中更新三条记录,通过pageinspect工具分析看出PG中的表更新是先把原来的记录标记为删除(xmin=xmax设置为事务5831),接着插入三条记录,从新设置这三条记录xmin和xmax(xmin=当前事务ID5831,xmax=0).这里想想如果vacuum(非完全清理)如果做的不及时,page中大量需要清理的dead 记录。vacuum做完会导致page中的记录被预留下来,为后面插入做准备,磁盘空间依然会预留下来。如果一个表更新的记录非常多,但是后面插入的非常少,这样就会导致表很大,但是实际的记录就非常少;如果做全完清理的vacuum,会释放dead 记录的磁盘空间
代码语言:javascript
复制
// 查询插入的三条记录,之前看到的t_min设置为了插入事务的ID,t_max为0
sampledb=*# select * from happy;
   id   |               name               
--------+----------------------------------
 260134 | eec3fca9d2a5f98bfdd4483b258c40bf
  89478 | 2bf9b47a7fed6e43c30378717032c9ba
 246125 | 9c0e3ad96911cb2ba8c2301f50339107
(3 rows)

// 更新该表的所有记录
sampledb=*# update happy set name=md5(random()::text)  where id>0;
UPDATE 3

// 更新后的数据
sampledb=*# select * from happy;
   id   |               name               
--------+----------------------------------
 260134 | 39b9970f54f6ac4b709dabcd1cb7516e
  89478 | 99277b85a7a705db61d24e77ca70589e
 246125 | 42a1b0db63c95f6221526b04ce1cafaf
(3 rows)

// 通过工具查看表的t_xmin和t_xmax
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
--------+--------+-----------------------------------------------------------------------------------------
   5831 |   5831 | {"\\x26f80300","\\x436565633366636139643261356639386266646434343833623235386334306266"}
   5831 |   5831 | {"\\x865d0100","\\x433262663962343761376665643665343363333033373837313730333263396261"}
   5831 |   5831 | {"\\x6dc10300","\\x433963306533616439363931316362326261386332333031663530333339313037"}
   5831 |      0 | {"\\x26f80300","\\x433339623939373066353466366163346237303964616263643163623735313665"}
   5831 |      0 | {"\\x865d0100","\\x433939323737623835613761373035646236316432346537376361373035383965"}
   5831 |      0 | {"\\x6dc10300","\\x433432613162306462363363393566363232313532366230346365316361666166"}
(6 rows)
  • 更新记录后,分别执行非完全清理的vacuum和完全清理的vacuum.做完全清理vacuum,数据会把dead记录物理page空间归还给OS,会留实际有效记录的空间;如果做非完全清理的vacuum,存在dead记录仅仅把记录的数据清理了,dead占用物理page的磁盘空间依然保留着。
代码语言:javascript
复制
// 清空表的所有page,并插入三条记录
sampledb=# truncate happy;
TRUNCATE TABLE
sampledb=# insert into happy values(260134,'39b9970f54f6ac4b709dabcd1cb7516e');
INSERT 0 1
sampledb=# insert into happy values(89478,'99277b85a7a705db61d24e77ca70589e');
INSERT 0 1
sampledb=# insert into happy values(246125,'42a1b0db63c95f6221526b04ce1cafaf');
INSERT 0 1
sampledb=# update happy set name=md5(random()::text)  where id>0;
UPDATE 3
sampledb=# select * from happy;
   id   |               name               
--------+----------------------------------
 260134 | ab95cb5ab40f9a9a99d41667b77e6e39
  89478 | 4355842e9bbfefdedb6fa26e7d5734dd
 246125 | bc4b418a4ffa286e183310c35b3d90c3
(3 rows)


// 更新后表中page仍然有6条记录,三条dead 记录;三条新插入的记录
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
--------+--------+-----------------------------------------------------------------------------------------
   5839 |   5842 | {"\\x26f80300","\\x433339623939373066353466366163346237303964616263643163623735313665"}
   5840 |   5842 | {"\\x865d0100","\\x433939323737623835613761373035646236316432346537376361373035383965"}
   5841 |   5842 | {"\\x6dc10300","\\x433432613162306462363363393566363232313532366230346365316361666166"}
   5842 |      0 | {"\\x26f80300","\\x436162393563623561623430663961396139396434313636376237376536653339"}
   5842 |      0 | {"\\x865d0100","\\x433433353538343265396262666566646564623666613236653764353733346464"}
   5842 |      0 | {"\\x6dc10300","\\x436263346234313861346666613238366531383333313063333562336439306333"}
(6 rows)

// 查看表的大小,目前是是在一个page内
sampledb=# select relpages, relpages*8192 as total_bytes, pg_relation_size('public.happy') as relsize  from pg_class where relname = 'happy';
 relpages | total_bytes | relsize 
----------+-------------+---------
        0 |           0 |    8192
(1 row)


// 针对表做非完全清理的vacuum
sampledb=# vacuum happy;
VACUUM

// pageinspect工具分析,可以看出原来的dead记录数据被清理,但是空间依然保留着
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
--------+--------+-----------------------------------------------------------------------------------------
        |        | 
        |        | 
        |        | 
   5842 |      0 | {"\\x26f80300","\\x436162393563623561623430663961396139396434313636376237376536653339"}
   5842 |      0 | {"\\x865d0100","\\x433433353538343265396262666566646564623666613236653764353733346464"}
   5842 |      0 | {"\\x6dc10300","\\x436263346234313861346666613238366531383333313063333562336439306333"}
(6 rows)

// 当前已经使用的page大小
sampledb=#  select relpages, relpages*8192 as total_bytes, pg_relation_size('public.happy') as relsize  from pg_class where relname = 'happy';
 relpages | total_bytes | relsize 
----------+-------------+---------
        1 |        8192 |    8192
(1 row)

// 做完全清理的vacuum
sampledb=# vacuum full happy;
VACUUM

// 查询做完后的该表的磁盘空间,通过工具可以看到这个表的dead记录空闲出来的空间被回收了,仅仅只有三条记录的空间
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
--------+--------+-----------------------------------------------------------------------------------------
   5842 |      0 | {"\\x26f80300","\\x436162393563623561623430663961396139396434313636376237376536653339"}
   5842 |      0 | {"\\x865d0100","\\x433433353538343265396262666566646564623666613236653764353733346464"}
   5842 |      0 | {"\\x6dc10300","\\x436263346234313861346666613238366531383333313063333562336439306333"}
(3 rows)
  • 删除该表的数据,依然会更新t_xmax为执行删除操作事务的ID。如果在做vacuum happy,会回收该表中已经删除记录的空间。如果PG配置auto vacuum,会定期做vacuum,清理物理page空间。但是如果上层应用大量的删除和更新操作,vacuum未必能做的过来,会导致表过渡膨胀,dead记录依然会占用很多空间。这样是当前PG存储引擎存在的问题。社区在开发一款新的引擎zheap,参考oracle的设计,使用undo来设置mvcc.这样dead记录不会和有效数据存储在一起,能解决这个问题。
代码语言:javascript
复制
// 同一个事务内插入三条记录,然后进行删除
sampledb=# begin;
BEGIN
sampledb=*# insert into happy values(260134,'39b9970f54f6ac4b709dabcd1cb7516e');
INSERT 0 1
sampledb=*# insert into happy values(89478,'99277b85a7a705db61d24e77ca70589e');
INSERT 0 1
sampledb=*# insert into happy values(246125,'42a1b0db63c95f6221526b04ce1cafaf');
INSERT 0 1

// 插入三条记录的t_xmin等于执行插入事务的ID,t_xmax=0
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
--------+--------+-----------------------------------------------------------------------------------------
   5846 |      0 | {"\\x26f80300","\\x433339623939373066353466366163346237303964616263643163623735313665"}
   5846 |      0 | {"\\x865d0100","\\x433939323737623835613761373035646236316432346537376361373035383965"}
   5846 |      0 | {"\\x6dc10300","\\x433432613162306462363363393566363232313532366230346365316361666166"}
(3 rows)

// 删除数据表的记录
sampledb=*# delete from happy;
DELETE 3

// 删除后数据t_xmin=t_xmax等于当前事务ID(5846)
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
--------+--------+-----------------------------------------------------------------------------------------
   5846 |   5846 | {"\\x26f80300","\\x433339623939373066353466366163346237303964616263643163623735313665"}
   5846 |   5846 | {"\\x865d0100","\\x433939323737623835613761373035646236316432346537376361373035383965"}
   5846 |   5846 | {"\\x6dc10300","\\x433432613162306462363363393566363232313532366230346365316361666166"}
(3 rows)

// 事务提交
sampledb=*# commit;
COMMIT

// 执行非完全清理的vacuum
sampledb=# vacuum happy;
VACUUM

// 再次查询发现事务的物理空间被回收了,执行删除操作这个表没有占用任何的物理page
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
ERROR:  block number 0 is out of range for relation "happy"
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2021-11-17,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 存储内核技术交流 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • pageinspect分析工具
    • 物理也分析
    相关产品与服务
    数据保险箱
    数据保险箱(Cloud Data Coffer Service,CDCS)为您提供更高安全系数的企业核心数据存储服务。您可以通过自定义过期天数的方法删除数据,避免误删带来的损害,还可以将数据跨地域存储,防止一些不可抗因素导致的数据丢失。数据保险箱支持通过控制台、API 等多样化方式快速简单接入,实现海量数据的存储管理。您可以使用数据保险箱对文件数据进行上传、下载,最终实现数据的安全存储和提取。
    领券
    问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档