5 Common Ways to Create a Table
5.1 Creating a Heap Table
drop table if exists test_head;
create table test_head(id int primary key) distributed by (id);
distributed by specifies the distribution key, which determines how rows are spread across the segments for storage.
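To verify how rows land on the segments, you can group by the hidden gp_segment_id system column; a quick sketch, assuming some test rows have been loaded:
insert into test_head select generate_series(1, 1000);
select gp_segment_id, count(*) from test_head group by 1 order by 1;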
5.2 Creating AO Tables
5.2.1 AO table without compression
drop table if exists test_ao;
create table test_ao(id int) with (appendonly=true) distributed by (id);
appendonly=true marks the table as AO (append-optimized) storage; the parameter accepts true or false, e.g. appendonly=true or appendonly=false.
5.2.2 AO table with compression
drop table if exists test_ao;
create table test_ao(id int) with (appendonly=true, compresslevel=5) distributed by (id);
compresslevel is the compression level, ranging from 1 to 9; the higher the value, the stronger the compression, and 5 is usually sufficient.
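To see what compression ratio was actually achieved after loading data, Greenplum provides the get_ao_compression_ratio function; a quick sketch:
select get_ao_compression_ratio('test_ao');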
5.2.3 Column-oriented AO table with compression
Column-oriented storage compresses differently from the row-oriented tables above.
drop table if exists test_ao;
create table test_ao(id int) with (appendonly=true,compresslevel=5, orientation=column) distributed by (id);
orientation selects the storage orientation; to compress by column, the only value to write is orientation=column.
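On column-oriented tables, compression can also be set per column with an ENCODING clause; a minimal sketch (the table name test_ao_col is made up for illustration):
drop table if exists test_ao_col;
create table test_ao_col(
    id int encoding (compresstype=zlib, compresslevel=5),
    flag char(1) encoding (compresstype=rle_type)
) with (appendonly=true, orientation=column) distributed by (id);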
5.2.3.1 Testing the orientation parameter
5.2.3.1.1 Table creation statements
Create the table without column-oriented storage (row-oriented):
CREATE TABLE e_alter_recoder_out_20180810(
s_ext_nodenum varchar ,
pripid varchar ,
s_ext_sequence varchar ,
alt_id varchar ,
altitem varchar ,
altbe varchar,
altaf varchar,
altdate varchar ,
alttime varchar ,
s_ext_timestamp varchar ,
s_ext_batch varchar ,
s_ext_validflag varchar,
jobid varchar,
entid varchar,
handle_type varchar
) WITH (appendonly=true, compresstype=zlib, compresslevel=5)
DISTRIBUTED BY (pripid);
Create the column-oriented compressed table:
CREATE TABLE e_alter_recoder_out_20180812(
s_ext_nodenum varchar ,
pripid varchar ,
s_ext_sequence varchar ,
alt_id varchar ,
altitem varchar ,
altbe varchar,
altaf varchar,
altdate varchar ,
alttime varchar ,
s_ext_timestamp varchar ,
s_ext_batch varchar ,
s_ext_validflag varchar,
jobid varchar,
entid varchar,
handle_type varchar
) WITH (appendonly=true, compresstype=zlib, compresslevel=5,orientation=column)
DISTRIBUTED BY (pripid);
As shown above, each table has 15 columns in total.
5.2.3.1.2 Checking the size of the source data
$ du -sh mv_e_alter_recoder_20180922.csv
48G mv_e_alter_recoder_20180922.csv
5.2.3.1.3 Loading the data with the COPY command
$ time psql -d stagging -h 192.168.209.11 -p 5432 -U gpadmin -c "\COPY e_alter_recoder_out_20180810 FROM '/data/oracle-export-data/DATA20180922/mv_e_alter_recoder_20180922.csv' WITH csv DELIMITER E'\001' LOG ERRORS SEGMENT REJECT LIMIT 3000 ROWS"
Password for user gpadmin:
real 11m49.978s
user 1m17.379s
sys 0m43.668s
$ time psql -d stagging -h 192.168.209.11 -p 5432 -U gpadmin -c "\COPY e_alter_recoder_out_20180812 FROM '/data/oracle-export-data/DATA20180922/mv_e_alter_recoder_20180922.csv' WITH csv DELIMITER E'\001' LOG ERRORS SEGMENT REJECT LIMIT 3000 ROWS"
Password for user gpadmin:
real 12m11.227s
user 1m27.575s
sys 0m50.548s
The results above show that loading the row-oriented table took 11m49.978s while the column-oriented one took 12m11.227s, a difference of roughly 21 seconds.
5.2.3.1.4 Checking the size inside the database
select pg_size_pretty(pg_relation_size('e_alter_recoder_out_20180810'));
-- 14 GB
select pg_size_pretty(pg_relation_size('e_alter_recoder_out_20180812'));
-- 11 GB
Column-oriented storage shaved off a further 3 GB: about 21 extra seconds of load time in exchange for 3 GB of space is well worth it.
5.2.3.1.5 Checking the row counts
select count(*) from e_alter_recoder_out_20180810;
select count(*) from e_alter_recoder_out_20180812;
5.2.3.2 Measuring the overall compression savings
5.2.3.2.1 Size of the original tables
SQL to query the total space occupied by each schema:
select pg_size_pretty(cast(sum(pg_relation_size(schemaname || '.' || tablename)) as bigint)), schemaname
from pg_tables t
inner join pg_namespace d on t.schemaname = d.nspname
group by schemaname;
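To drill down from schemas to individual tables, e.g. the ten largest (the schema name xiaoxu matches the later examples; adjust as needed):
select schemaname, tablename,
       pg_size_pretty(pg_relation_size(schemaname || '.' || tablename))
from pg_tables
where schemaname = 'xiaoxu'
order by pg_relation_size(schemaname || '.' || tablename) desc
limit 10;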
The schema holds 750 tables occupying 940 GB, stored with the original options: WITH (appendonly=true, compresstype=zlib, compresslevel=5).
5.2.3.2.2 Size after recompression
The storage options were changed to: WITH (appendonly = true, compresstype = zlib, compresslevel = 7, orientation = column, checksum = false, blocksize = 2097152); see the parameter descriptions above for their meaning.
As the figures show, roughly 320 GB of space was saved.
5.2.3.3 Comparing compression types
5.2.3.3.1 Size before changing the compression type
The baseline table uses the following options:
with ( appendonly = true, compresstype = zlib, compresslevel = 7
,orientation=column, checksum = false,blocksize = 2097152 )
stagging=# select pg_size_pretty(pg_relation_size('xiaoxu.e_alter_recoder_out_20180810_com'));
pg_size_pretty
----------------
11 GB
(1 row)
Time: 33.880 ms
With compresstype = zlib the table occupies 11 GB. Now change compresstype and compare the sizes; the supported values are zlib, quicklz, and rle_type.
5.2.3.3.2 Using quicklz compression
stagging=# create table xiaoxu.e_alter_recoder_out_20180810_com_quicklz with (appendonly = true, compresstype =quicklz, compresslevel = 1 ,orientation=column, checksum = false,blocksize = 2097152) as select * from xiaoxu.e_alter_recoder_out_20180810_com Distributed by (pripid);
Time: 59158.322 ms
stagging=# select pg_size_pretty(pg_relation_size('xiaoxu.e_alter_recoder_out_20180810_com_quicklz'));
pg_size_pretty
----------------
14 GB
(1 row)
Time: 34.632 ms
With quicklz the size jumps straight to 14 GB. (Note that quicklz only supports compresslevel = 1, while the zlib baseline used level 7, so this is not a level-for-level comparison.)
5.2.3.3.3 Using rle_type compression
stagging=# create table xiaoxu.e_alter_recoder_out_20180810_com_rle_type with (appendonly = true, compresstype = rle_type, compresslevel = 1 ,orientation=column, checksum = false,blocksize = 2097152) as select * from xiaoxu.e_alter_recoder_out_20180810_com Distributed by (pripid);
Time: 124042.881 ms
stagging=# select pg_size_pretty(pg_relation_size('xiaoxu.e_alter_recoder_out_20180810_com_rle_type'));
pg_size_pretty
----------------
38 GB
(1 row)
Time: 10.006 ms
rle_type is even worse: the table balloons to 38 GB, so it was dropped immediately. RLE only pays off when column values repeat in long runs (sorted or low-cardinality data), which is not the case here.
5.3 Creating HDFS External Tables
5.3.1 Creating a readable external table
CREATE EXTERNAL TABLE e_alter_recoder_out_20180812(
s_ext_nodenum varchar ,
pripid varchar ,
s_ext_sequence varchar ,
alt_id varchar ,
altitem varchar ,
altbe varchar,
altaf varchar,
altdate varchar ,
alttime varchar ,
s_ext_timestamp varchar ,
s_ext_batch varchar ,
s_ext_validflag varchar,
jobid varchar,
entid varchar,
handle_type varchar )
LOCATION ('gphdfs://nameservice1/tmp/e_alter_recoder_20180812/E_ALTER_RECODER/*')
FORMAT 'text' (DELIMITER E'\u0001' NULL AS 'null string' ESCAPE AS 'OFF' FILL MISSING FIELDS)
LOG ERRORS SEGMENT REJECT LIMIT 3000 ROWS;
External tables require the EXTERNAL keyword.
nameservice1 is the HDFS HA nameservice address and must be configured in advance; alternatively, a namenode address such as 192.168.209.105:8090 can be written directly.
/tmp/e_alter_recoder_20180812/E_ALTER_RECODER/ is the path on HDFS.
The DELIMITER is E'\u0001', i.e. the invisible SOH control character.
LOG ERRORS sends rejected rows to GP's default error log, readable via gp_read_error_log.
SEGMENT REJECT LIMIT 3000 ROWS is the maximum number of rejected rows allowed; it can be raised or lowered, with a minimum of 2.
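Once defined, the external table can be queried like an ordinary table; a typical pattern is to validate the row count and then load the data into a local table (the local table name below is hypothetical):
select count(*) from e_alter_recoder_out_20180812;
-- hypothetical local table with the same column list:
insert into e_alter_recoder_local select * from e_alter_recoder_out_20180812;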
5.3.2 Viewing the rejected rows
SELECT gp_read_error_log('tableName');
Columns of the error log:
Column | Type | Modifiers
----------+--------------------------+-----------
cmdtime  | timestamp with time zone | -- time of the operation
relname  | text                     | -- table name
filename | text                     | -- source file name
linenum  | integer                  | -- line number of the rejected row
bytenum  | integer                  |
errmsg   | text                     | -- error message
rawdata  | text                     | -- raw content of the rejected line
rawbytes | bytea                    | -- raw bytes of the line
Distributed randomly
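After the problem rows have been handled, the accumulated entries can be cleared with gp_truncate_error_log; a quick sketch:
select gp_truncate_error_log('tableName');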
For a detailed discussion, see: Common Greenplum data-loading errors and how to resolve them.
5.3.3 Creating a writable external table
5.3.3.1 Writable external table syntax
CREATE WRITABLE EXTERNAL TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
LOCATION ('gpfdist://outputhost[:port]/filename'[, ...]) | ('gphdfs://hdfs_host[:port]/path')
FORMAT 'TEXT'
[( [DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[ESCAPE [AS] 'escape' | 'OFF'] )]
| 'CSV'
[([QUOTE [AS] 'quote']
[DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[FORCE QUOTE column [, ...]]
[ESCAPE [AS] 'escape'] )]
[ ENCODING 'write_encoding' ]
[DISTRIBUTED BY (column, [ ... ] ) | DISTRIBUTED RANDOMLY]
5.3.3.2 Creating the writable external table
stagging=# drop external table xiaoxu.hdfs_writable_test;
DROP EXTERNAL TABLE
Time: 85.875 ms
stagging=# CREATE WRITABLE EXTERNAL TABLE xiaoxu.hdfs_writable_test (LIKE xiaoxu.e_alter_recoder_out_20180810
) LOCATION ('gphdfs://nameservice1/tmp/xiaoxu/hdfs_writable_test'
) format 'text' (DELIMITER E'\u0001' NULL as 'null string' ESCAPE as 'OFF'
) Distributed by (pripid);
Time: 55.628 ms
stagging=# insert into xiaoxu.hdfs_writable_test select * from xiaoxu.e_alter_recoder_out_20180810 limit 10;
INSERT 0 10
Time: 1582.187 ms
xiaoxu.hdfs_writable_test: the table name
/tmp/xiaoxu/hdfs_writable_test: the target path on HDFS
As shown above, adding the WRITABLE keyword is all it takes to turn an external table definition into a writable one.
5.3.3.3 Checking the data on HDFS
Inspecting the files on HDFS confirms that the data has been written out and delimited correctly.
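The original screenshot is not reproduced here; a typical way to check from the shell (assuming the HDFS client is configured on the host):
$ hdfs dfs -ls /tmp/xiaoxu/hdfs_writable_test
$ hdfs dfs -cat /tmp/xiaoxu/hdfs_writable_test/* | head -n 2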
5.4 Creating Partitioned Tables
5.4.1 Partitioned table concepts
A partitioned table physically splits one large table into several pieces. Partitioned tables in GPDB are implemented on the same principle as in PostgreSQL: table inheritance plus check constraints. There is a difference, however. In PostgreSQL a partitioned table is a parent table with several child tables, and data must be inserted into the child tables by hand; rows inserted into the parent simply stay in the parent. In GPDB you can insert directly into the parent table, and each row is routed to the matching child automatically according to the constraints; if no matching child partition exists, the insert fails.
5.4.2 Creating a partitioned table
drop table if exists t05;
create table t05(id int, createOn date) with (appendonly=true, compresslevel=5,orientation=column) distributed by (id)
Partition By Range(createOn)(
partition p2016 start ('2016/01/01'::date) inclusive end ('2016/12/31'::date) exclusive,
partition p2017 start ('2017/01/01'::date) inclusive end ('2017/12/31'::date) exclusive,
partition p2018 start ('2018/01/01'::date) inclusive end ('2018/12/31'::date) exclusive
-- default partition otherTime
);
Range is the keyword for range partitioning.
p2016 is the name of the partition.
start is the beginning of the partition's range.
end is the end of the partition's range.
default partition otherTime shows how an optional default partition can be declared.
inclusive: the boundary is included; e.g. start ('2016/01/01') inclusive above includes '2016/01/01'.
exclusive: the boundary is excluded; e.g. end ('2016/12/31') exclusive above excludes '2016/12/31'. Note that this leaves 2016-12-31 itself in no partition; to cover the whole year without a gap, write end ('2017/01/01'::date) exclusive instead.
5.4.3 Inspecting the created partitions
5.4.3.1 Viewing the partition names
GP generates the child-table names automatically with a prefix: t05_1_prt_p2016, where p2016 is the partition's name.
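The generated child tables can also be listed from the pg_partitions catalog view; a quick sketch:
select partitionname, partitiontablename from pg_partitions where tablename = 't05';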
5.4.3.2 Viewing the table structure
stagging=# \d+ t05_1_prt_p2016
Append-Only Columnar Table "public.t05_1_prt_p2016"
Column | Type | Modifiers | Storage | Compression Type | Compression Level | Block Size | Description
----------+---------+-----------+---------+------------------+-------------------+------------+-------------
id | integer | | plain | zlib | 5 | 32768 |
createon | date | | plain | zlib | 5 | 32768 |
Checksum: t
Check constraints:
"t05_1_prt_p2016_check" CHECK (createon >= '2016-01-01'::date AND createon
Inherits: t05
Has OIDs: no
Options: appendonly=true, compresslevel=5, orientation=column
Distributed by: (id)
The output above shows that a partition is essentially a constraint range managed through the parent table: the clause partition p2016 start ('2016/01/01'::date) end ('2016/12/31'::date) translates into the check constraint createon >= '2016-01-01'::date AND createon < '2016-12-31'::date.
5.4.4 Inserting data into the partitioned table
5.4.4.1 Data-generation script
$ cat date_partition.sh
#!/bin/bash
# $1 is the start date (e.g. 20160101)
# $2 is the end date (exclusive; the loop stops when it is reached)
# $3 is the file the generated rows are appended to
begin_date=$1
end_date=$2
while [ "$begin_date" != "$end_date" ]
do
    echo "${begin_date}|${begin_date}" >> "$3"
    begin_date=$(date -d "$begin_date +1 day" +%Y%m%d)
done
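For example, to generate the 2016 data file loaded below (the end date is exclusive, matching the loop's stop condition):
$ bash date_partition.sh 20160101 20161231 20161231.csv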
5.4.4.2 Inserting the data
When loading, it is enough to insert into the parent table:
$ psql -d stagging -h 192.168.209.11 -p 5432 -U gpadmin -c "\COPY t05 FROM '/home/*******/partition_data/20161231.csv' WITH csv DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 5000 ROWS"
If you load rows for which no matching partition exists, an error like the following is reported:
$ psql -d stagging -h 192.168.209.11 -p 5432 -U gpadmin -c "\COPY t05 FROM '/home/******/partition_data/20191231.csv' WITH csv DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 5000 ROWS"
NOTICE: Found 364 data formatting errors (364 or more input rows). Rejected related input data
5.4.5 Checking the data and its distribution
5.4.5.1 Querying the parent table
select * from t05 where id BETWEEN '20160101' AND '20181231';
5.4.5.2 Querying a partition directly
select * from t05_1_prt_p2016;
5.4.5.3 Checking data distribution across segments
Distribution of a single partition:
select gp_segment_id,count(1) from t05_1_prt_p2016 group by 1;
Distribution of the parent table:
select gp_segment_id,count(1) from t05 group by 1;
5.4.6 Common partition operations
5.4.6.1 Adding a partition
ALTER TABLE t05 ADD PARTITION p2019
START (date '2019/01/01') END (date '2019/12/31') EXCLUSIVE;
5.4.6.2 Dropping a partition
alter table t05 drop partition p2016;
5.4.6.3 Truncating a partition
Equivalent to dropping and recreating the partition:
alter table t05 truncate partition p2016;
5.4.6.4 Splitting a partition
alter table t05 split partition p2019 at ('2019-06-30') into (partition p2019, partition p2020);
5.4.6.5 Splitting the default partition
alter table t05 split default partition start ('2017-03-01') end ('2017-03-31') into (partition p4, default partition);
5.4.7 Creating a numeric-range partitioned table
drop table if exists t06;
create table t06(id int,dayOfPeriod int) distributed by (id)
partition by range(dayOfPeriod)
(
start (1) end (31) every(1),
default partition none
);
every(1) generates one partition for each step of 1 across the range, i.e. one partition per day of the period.
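To confirm what every(1) generated, count the child partitions in pg_partitions; with start (1) end (31) every (1) this should report the 30 one-step ranges plus the default partition:
select count(*) from pg_partitions where tablename = 't06';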
5.4.8 Creating a list-partitioned table
drop table if exists t07;
create table t07(id int , gender char(2)) with(appendonly=true,compresslevel=5) distributed by (id)
partition by list(gender)
(
partition man values('m'),
partition woman values('f'),
default partition unknown
);
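A quick sketch of how rows are routed (the child-table names follow the t07_1_prt_&lt;partition&gt; pattern seen earlier):
insert into t07 values (1, 'm'), (2, 'f'), (3, 'x');
select * from t07_1_prt_man;     -- the row with gender 'm'
select * from t07_1_prt_unknown; -- 'x' is not in any list, so it lands in the default partition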
5.5 Fast Table Copy (CREATE TABLE AS)
CREATE TABLE e_alter_recoder_out_20180814 WITH (
appendonly = TRUE,
compresstype = zlib,
compresslevel = 5,
orientation = column
) AS SELECT
*
FROM
e_alter_recoder_out_20180812 Distributed BY (pripid);
Execution time as reported by the SQL client:
Time: 69.977 s
Rows affected: 156,784,862
So copying the 156,784,862 rows of this 14 GB table took just 69.977 s.