在数据库查询优化中,直方图(Histogram)通过统计字段值的分布情况,帮助查询优化器更精准地估算查询条件的选择率,从而生成更优的执行计划。尤其对于非索引字段或数据分布倾斜的场景,直方图能以非常低的维护成本显著提升查询性能。
今天给大家通过具体示例展示直方图的创建、应用及效果验证,并提供初始化数据供读者复现实验。
场景1:销售表(sales)
CREATE TABLE sales (
id INT PRIMARY KEY AUTO_INCREMENT,
product_name VARCHAR(50),
price DECIMAL(10,2),
region ENUM('East', 'West', 'North', 'South'),
sale_date DATE
);
-- 插入20万条数据,80%的price在100-200之间
DELIMITER $$
CREATE PROCEDURE GenerateSalesData()
BEGIN
DECLARE i INT DEFAULT 0;
DECLARE batch_size INT DEFAULT 1000; -- 每批次插入1000条
DECLARE max_records INT DEFAULT 200000;
-- 关闭自动提交以提升性能
SET autocommit = 0;
WHILE i < max_records DO
INSERT INTO sales (product_name, price, region, sale_date)
SELECT
CONCAT('Product_', FLOOR(RAND() * 1000)),
CASE
WHEN RAND() < 0.8 THEN 100 + RAND() * 100 -- 80%数据在100-200
ELSE 500 + RAND() * 500 -- 20%数据在500-1000
END,
ELT(FLOOR(1 + RAND() * 4), 'East', 'West', 'North', 'South'),
DATE_SUB(NOW(), INTERVAL FLOOR(RAND() * 365) DAY)
FROM (
-- 生成1000行的临时序列(替代笛卡尔积)
WITH RECURSIVE seq AS (
SELECT 1 AS n
UNION ALL
SELECT n + 1 FROM seq
WHERE n < 1000
)
SELECT n FROM seq
) AS tmp;
SET i = i + batch_size;
COMMIT; -- 分批提交事务
END WHILE;
-- 恢复自动提交
SET autocommit = 1;
END$$
DELIMITER ;
CALL GenerateSalesData(); -- 生成20万条数据
场景2:用户活动表(user_activity)
CREATE TABLE user_activity (
user_id INT PRIMARY KEY,
login_time TIME,
activity_type VARCHAR(20)
);
-- 插入10万条数据,70%的登录时间在09:00-12:00
DELIMITER $$
CREATE PROCEDURE InsertUserActivityData()
BEGIN
DECLARE i INT DEFAULT 1;
WHILE i <= 100000 DO
-- 计算login_time和activity_type
SET @random_value = RAND();
SET @hour = IF(@random_value < 0.7,
FLOOR(9 + (@random_value * 3) * 3),
IF(@random_value < 0.85,
FLOOR(RAND() * 9),
12 + FLOOR(RAND() * 12)
)
);
SET @minute = FLOOR(RAND() * 60);
SET @second = FLOOR(RAND() * 60);
SET @login_time = CONCAT(LPAD(@hour, 2, '0'), ':', LPAD(@minute, 2, '0'), ':', LPAD(@second, 2, '0'));
-- 设置activity_type
SET @activity_type = IF(RAND() > 0.5, 'login', 'other');
-- 插入数据
INSERT INTO user_activity (user_id, login_time, activity_type) VALUES (i, @login_time, @activity_type);
SET i = i + 1;
END WHILE;
END$$
DELIMITER ;
-- 执行存储过程
CALL InsertUserActivityData();
-- 为sales表的price字段创建直方图(64个桶)
ANALYZE TABLE sales UPDATE HISTOGRAM ON price WITH 64 BUCKETS;
ANALYZE TABLE sales UPDATE HISTOGRAM ON price, region
。ANALYZE TABLE sales DROP HISTOGRAM ON price;
information_schema.COLUMN_STATISTICS
视图中,以JSON格式记录桶的分布、频率及数据类型。 SELECT JSON_PRETTY(HISTOGRAM)
FROM information_schema.COLUMN_STATISTICS
WHERE TABLE_NAME = 'sales' AND COLUMN_NAME = 'price';
输出示例(等高直方图):
{
"buckets": [
[
100.02,
102.09,
0.01557782876464195,
98
],
[
102.15,
104.11,
0.03127641589180051,
95
],
[
104.12,
105.85,
0.046975003018959065,
93
],
[
105.86,
107.91,
0.06267359014611762,
95
],
[
107.93,
109.88,
0.07837217727327618,
100
],
[
109.92,
111.64,
0.09407076440043473,
87
],
[
111.69,
113.73,
0.10976935152759329,
98
],
[
113.75,
116.18,
0.12546793865475184,
102
],
[
116.20,
117.94,
0.14104576741939379,
88
],
[
117.96,
119.92,
0.15674435454655236,
98
],
[
119.93,
121.97,
0.1724429416737109,
93
],
[
121.99,
124.26,
0.18814152880086946,
92
],
[
124.27,
126.21,
0.203840115928028,
97
],
[
126.22,
128.05,
0.21953870305518658,
90
],
[
128.07,
129.92,
0.23523729018234513,
96
],
[
129.93,
131.74,
0.2509358773095037,
89
],
[
131.76,
133.65,
0.2666344644366622,
88
],
[
133.66,
135.71,
0.28233305156382077,
99
],
[
135.74,
137.65,
0.2980316386909794,
105
],
[
137.69,
139.46,
0.3137302258181379,
89
],
[
139.48,
141.51,
0.32942881294529647,
97
],
[
141.54,
143.94,
0.3450066417099384,
97
],
[
143.95,
145.91,
0.36058447047458037,
100
],
[
145.94,
148.21,
0.37616229923922234,
99
],
[
148.23,
150.17,
0.3918608863663809,
99
],
[
150.20,
151.83,
0.4074387151310228,
90
],
[
151.84,
153.76,
0.42313730225818136,
98
],
[
153.79,
155.60,
0.43883588938533996,
95
],
[
155.62,
157.30,
0.4545344765124985,
86
],
[
157.32,
159.22,
0.47023306363965706,
96
],
[
159.23,
161.42,
0.4859316507668156,
98
],
[
161.44,
163.29,
0.5016302378939742,
98
],
[
163.32,
165.12,
0.5173288250211328,
96
],
[
165.15,
166.81,
0.5330274121482913,
96
],
[
166.82,
168.64,
0.5486052409129332,
93
],
[
168.65,
170.60,
0.5641830696775751,
93
],
[
170.61,
172.71,
0.5798816568047337,
95
],
[
172.72,
174.66,
0.5954594855693757,
93
],
[
174.67,
176.87,
0.6110373143340176,
92
],
[
176.89,
178.68,
0.6267359014611762,
95
],
[
178.69,
180.63,
0.6424344885883347,
92
],
[
180.65,
182.46,
0.6580123173529767,
95
],
[
182.47,
184.19,
0.6737109044801353,
88
],
[
184.20,
186.04,
0.6894094916072938,
94
],
[
186.05,
188.13,
0.7051080787344524,
96
],
[
188.16,
190.23,
0.7208066658616109,
97
],
[
190.24,
192.17,
0.7365052529887695,
86
],
[
192.21,
194.22,
0.752203840115928,
101
],
[
194.23,
196.21,
0.7679024272430865,
96
],
[
196.24,
198.18,
0.7836010143702451,
100
],
[
198.21,
503.32,
0.7992996014974036,
96
],
[
503.48,
544.50,
0.8148774302620456,
126
],
[
545.27,
583.57,
0.8305760173892042,
128
],
[
583.71,
618.13,
0.8462746045163627,
129
],
[
618.19,
657.66,
0.8619731916435213,
128
],
[
657.88,
697.13,
0.8776717787706799,
128
],
[
697.44,
734.79,
0.8933703658978385,
126
],
[
734.82,
770.25,
0.909068953024997,
129
],
[
770.30,
810.23,
0.9247675401521556,
128
],
[
810.94,
852.52,
0.9404661272793141,
130
],
[
853.20,
894.47,
0.9561647144064727,
128
],
[
894.66,
930.69,
0.9718633015336312,
128
],
[
931.37,
974.87,
0.9875618886607898,
127
],
[
975.28,
999.41,
1.0,
101
]
],
"data-type": "decimal",
"null-values": 0.0,
"collation-id": 8,
"last-updated": "2025-04-22 05:24:10.208599",
"sampling-rate": 1.0,
"histogram-type": "equi-height",
"number-of-buckets-specified": 64
}
等高直方图(Equi-height):每个桶保存值范围及累积频率,适用于连续数据(如时间、金额)。
自动选择机制:MySQL根据数据分布和Bucket数量自动选择类型,用户无法手动干预。
EXPLAIN SELECT * FROM sales WHERE price BETWEEN 150 AND 200;
-- 输出:全表扫描,filtered≈11.11%(假设均匀分布)
ANALYZE TABLE sales UPDATE HISTOGRAM ON price;
EXPLAIN SELECT * FROM sales WHERE price BETWEEN 150 AND 200;
-- 输出:可能使用索引或减少扫描行数,filtered≈39.04%
说明:filtered
表示通过查询条件过滤后,预估满足条件的行数占扫描总行数的百分比。该值范围从 0 到 100,值越大表示过滤效率越高。单表全表扫描(type=ALL
)filtered
越大,表示 WHERE
条件过滤效率越高(更多无效数据被过滤)。
为region
和price
字段同时创建直方图,优化器可联合估算选择率:
EXPLAIN SELECT * FROM sales WHERE price BETWEEN 150 AND 200 AND region = 'East';
SELECT * FROM sales
WHERE price BETWEEN 150 AND 200
AND region = 'East';
region='East'
(假设占25%)和price
范围的条件联合选择率预估为(25% * 80% = 20%)。-- 为region和price字段同时创建直方图
ANALYZE TABLE sales UPDATE HISTOGRAM ON price,region;
EXPLAIN SELECT * FROM sales WHERE price BETWEEN 150 AND 200 AND region = 'East';
histogram_generation_max_mem_size
限制内存并触发采样。 ANALYZE TABLE
。 直方图是MySQL 8.0中一项高效的统计工具,特别适用于数据分布倾斜或无法创建索引的场景。通过合理配置Bucket数量和定期更新统计信息,可显著提升复杂查询的性能。
互动提问:
你在实际工作中是否遇到过因数据分布不均导致的性能问题?尝试过直方图优化吗?欢迎分享你的案例或疑问!
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。