(五)进阶技术 14. 维度合并 随着数据仓库中维度的增加,会发现有些通用的数据存在于多个维度中。例如,客户维度的客户邮编相关信息、送货邮编相关信息和工厂维度里都有邮编、城市和州。本篇说明如何把三个维度里的邮编相关信息合并到一个新的邮编维度。 修改数据仓库模式 为了合并维度,需要改变数据仓库模式。图(五)- 14-1显示了修改后的模式。新增了一个zip_code_dim表,sales_order_fact和production_fact表的结构也做了相应的修改。注意图中只显示了与邮编维度相关的表。
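图(五)- 14-1 zip_code_dim表与两个事实表相关联。这些关系替换了这两个事实表与客户维度、工厂维度的关系。sales_order_fact表需要两个关系,一个关联到客户邮编,另一个关联到送货邮编。与production_fact表只有一个关系,所以在这个事实表里只增加了工厂邮编代理键。 清单(五)-14-1里的脚本用于修改数据仓库模式。所做的修改如下:(1)创建zip_code_dim表并初始装载邮编数据;(2)基于zip_code_dim创建customer_zip_code_dim和shipping_zip_code_dim视图;(3)给sales_order_fact表增加customer_zip_code_sk和shipping_zip_code_sk两列及其外键,并做初始装载;(4)从customer_dim和pa_customer_dim表中删除邮编、城市和州列;(5)基于zip_code_dim创建factory_zip_code_dim视图;(6)给production_fact表增加factory_zip_code_sk列及其外键,并做初始装载;(7)把factory_code定义为factory_stg表的主键,并用factory_dim表里的数据装载factory_stg;(8)从factory_dim表中删除邮编、城市和州列。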
USE dw;

-- Consolidated zip code dimension: one row per zip code with its city and state.
-- NOTE(review): zip_code is stored as an integer, so a zip code with leading
-- zeros would lose them — confirm this is acceptable for the data set.
CREATE TABLE zip_code_dim (
    zip_code_sk    INT NOT NULL AUTO_INCREMENT,  -- surrogate key
    zip_code       INT(5),
    city           VARCHAR(30),
    state          VARCHAR(2),
    version        INT  DEFAULT 1,
    effective_date DATE DEFAULT '1900-01-01',
    expiry_date    DATE DEFAULT '2200-01-01',
    PRIMARY KEY (zip_code_sk)
);
-- Initial load of the zip code rows. The surrogate key comes from
-- AUTO_INCREMENT; version, effective_date and expiry_date take their defaults.
INSERT INTO zip_code_dim (zip_code, city, state)
VALUES
    (17050, 'PITTSBURGH',     'PA'),
    (17051, 'MC VEYTOWN',     'PA'),
    (17052, 'MAPLETON DEPOT', 'PA'),
    (17053, 'MARYSVILLE',     'PA'),
    (17054, 'MATTAWANA',      'PA'),
    (17055, 'MECHANICSBURG',  'PA'),
    (44102, 'CLEVELAND',      'OH');
COMMIT;
-- Role-playing views over zip_code_dim: each exposes the same rows under
-- role-specific column names.

-- Customer zip code role.
CREATE VIEW customer_zip_code_dim AS
SELECT
    z.zip_code_sk AS customer_zip_code_sk,
    z.zip_code    AS customer_zip_code,
    z.city        AS customer_city,
    z.state       AS customer_state,
    z.version,
    z.effective_date,
    z.expiry_date
FROM zip_code_dim AS z;

-- Shipping zip code role.
CREATE VIEW shipping_zip_code_dim AS
SELECT
    z.zip_code_sk AS shipping_zip_code_sk,
    z.zip_code    AS shipping_zip_code,
    z.city        AS shipping_city,
    z.state       AS shipping_state,
    z.version,
    z.effective_date,
    z.expiry_date
FROM zip_code_dim AS z;
-- Add the two zip code surrogate key columns, placed right after customer_sk.
ALTER TABLE sales_order_fact
    ADD COLUMN customer_zip_code_sk INT AFTER customer_sk,
    ADD COLUMN shipping_zip_code_sk INT AFTER customer_zip_code_sk;

-- Both new columns reference the consolidated zip code dimension.
ALTER TABLE sales_order_fact
    ADD FOREIGN KEY (customer_zip_code_sk) REFERENCES zip_code_dim (zip_code_sk),
    ADD FOREIGN KEY (shipping_zip_code_sk) REFERENCES zip_code_dim (zip_code_sk);
-- Initial load of the two zip code surrogate keys on existing fact rows.
-- Each fact row's customer_dim version supplies the zip code, which is then
-- looked up in the matching role view.

-- Customer zip code key.
UPDATE sales_order_fact AS f
    INNER JOIN customer_dim AS cust
        ON cust.customer_sk = f.customer_sk
    INNER JOIN customer_zip_code_dim AS zip
        ON zip.customer_zip_code = cust.customer_zip_code
SET f.customer_zip_code_sk = zip.customer_zip_code_sk;

-- Shipping zip code key. Fact rows whose customer version has no matching
-- shipping zip code are not joined and keep NULL.
UPDATE sales_order_fact AS f
    INNER JOIN customer_dim AS cust
        ON cust.customer_sk = f.customer_sk
    INNER JOIN shipping_zip_code_dim AS zip
        ON zip.shipping_zip_code = cust.shipping_zip_code
SET f.shipping_zip_code_sk = zip.shipping_zip_code_sk;

COMMIT;
-- The zip code, city and state now live in zip_code_dim, so remove them from
-- the customer dimension ...
ALTER TABLE customer_dim
    DROP COLUMN customer_zip_code,
    DROP COLUMN customer_city,
    DROP COLUMN customer_state,
    DROP COLUMN shipping_zip_code,
    DROP COLUMN shipping_city,
    DROP COLUMN shipping_state;

-- ... and from the PA customer subset dimension.
ALTER TABLE pa_customer_dim
    DROP COLUMN customer_zip_code,
    DROP COLUMN customer_city,
    DROP COLUMN customer_state,
    DROP COLUMN shipping_zip_code,
    DROP COLUMN shipping_city,
    DROP COLUMN shipping_state;
-- Factory role view over zip_code_dim: same rows, factory-specific column names.
CREATE VIEW factory_zip_code_dim AS
SELECT
    z.zip_code_sk AS factory_zip_code_sk,
    z.zip_code    AS factory_zip_code,
    z.city        AS factory_city,
    z.state       AS factory_state,
    z.version,
    z.effective_date,
    z.expiry_date
FROM zip_code_dim AS z;
-- Add the factory zip code surrogate key to production_fact, right after factory_sk.
ALTER TABLE production_fact
    ADD COLUMN factory_zip_code_sk INT AFTER factory_sk;

-- The new column references the consolidated zip code dimension.
ALTER TABLE production_fact
    ADD FOREIGN KEY (factory_zip_code_sk) REFERENCES zip_code_dim (zip_code_sk);
-- Initial load of factory_zip_code_sk on existing production facts: the
-- factory's zip code is looked up in the factory role view.
UPDATE production_fact AS f
    INNER JOIN factory_dim AS fac
        ON fac.factory_sk = f.factory_sk
    INNER JOIN factory_zip_code_dim AS zip
        ON zip.factory_zip_code = fac.factory_zip_code
SET f.factory_zip_code_sk = zip.factory_zip_code_sk;

COMMIT;
-- Make factory_code the primary key of factory_stg and seed the staging table
-- with the factory rows currently in factory_dim. The staging table has to
-- hold the complete factory data (including zip code, city and state) for the
-- regular production load, and the primary key is needed to maintain it.
TRUNCATE TABLE factory_stg;

ALTER TABLE factory_stg
    ADD PRIMARY KEY (factory_code);

INSERT INTO factory_stg
SELECT
    fd.factory_code,
    fd.factory_name,
    fd.factory_street_address,
    fd.factory_zip_code,
    fd.factory_city,
    fd.factory_state
FROM factory_dim AS fd;

COMMIT;
-- Remove the factory zip code, city and state columns from factory_dim.
ALTER TABLE factory_dim
    DROP COLUMN factory_zip_code,
    DROP COLUMN factory_city,
    DROP COLUMN factory_state;
清单(五)-14-1
执行完清单(五)-14-1里的脚本后,可以查询customer_zip_code_dim、shipping_zip_code_dim、factory_zip_code_dim维度表和sales_order_fact、production_fact事实表,确认邮编已经被成功分离。查询语句和结果如下所示。 mysql> select -> customer_zip_code_sk sk, -> customer_zip_code zip, -> customer_city city, -> customer_state state -> from -> customer_zip_code_dim; +----+-------+----------------+-------+ | sk | zip | city | state | +----+-------+----------------+-------+ | 1 | 17050 | PITTSBURGH | PA | | 2 | 17051 | MC VEYTOWN | PA | | 3 | 17052 | MAPLETON DEPOT | PA | | 4 | 17053 | MARYSVILLE | PA | | 5 | 17054 | MATTAWANA | PA | | 6 | 17055 | MECHANICSBURG | PA | | 7 | 44102 | CLEVELAND | OH | +----+-------+----------------+-------+ 7 rows in set (0.00 sec) mysql> mysql> select -> shipping_zip_code_sk sk, -> shipping_zip_code zip, -> shipping_city city, -> shipping_state sta -> from -> shipping_zip_code_dim; +----+-------+----------------+------+ | sk | zip | city | sta | +----+-------+----------------+------+ | 1 | 17050 | PITTSBURGH | PA | | 2 | 17051 | MC VEYTOWN | PA | | 3 | 17052 | MAPLETON DEPOT | PA | | 4 | 17053 | MARYSVILLE | PA | | 5 | 17054 | MATTAWANA | PA | | 6 | 17055 | MECHANICSBURG | PA | | 7 | 44102 | CLEVELAND | OH | +----+-------+----------------+------+ 7 rows in set (0.00 sec) mysql> mysql> select -> factory_zip_code_sk sk, -> factory_zip_code zip, -> factory_city city, -> factory_state state -> from -> factory_zip_code_dim; +----+-------+----------------+-------+ | sk | zip | city | state | +----+-------+----------------+-------+ | 1 | 17050 | PITTSBURGH | PA | | 2 | 17051 | MC VEYTOWN | PA | | 3 | 17052 | MAPLETON DEPOT | PA | | 4 | 17053 | MARYSVILLE | PA | | 5 | 17054 | MATTAWANA | PA | | 6 | 17055 | MECHANICSBURG | PA | | 7 | 44102 | CLEVELAND | OH | +----+-------+----------------+-------+ 7 rows in set (0.00 sec) mysql> mysql> select -> order_date_sk odsk, -> customer_sk csk, -> customer_zip_code_sk czsk, -> shipping_zip_code_sk szsk -> from -> sales_order_fact -> order by order_date_sk; 
+------+------+------+------+ | odsk | csk | czsk | szsk | +------+------+------+------+ | 4809 | 3 | 6 | NULL | | 4854 | 4 | 1 | NULL | | 4889 | 5 | 1 | NULL | | 4960 | 6 | 6 | NULL | | 4993 | 7 | 1 | NULL | | 5063 | 1 | 1 | NULL | | 5119 | 2 | 6 | NULL | | 5155 | 3 | 6 | NULL | | 5188 | 4 | 1 | NULL | | 5224 | 5 | 1 | NULL | | 5264 | 6 | 6 | NULL | | 5266 | 7 | 1 | NULL | | 5310 | 1 | 1 | NULL | | 5356 | 2 | 6 | NULL | | 5362 | 3 | 6 | NULL | | 5392 | 4 | 1 | NULL | | 5489 | 5 | 1 | NULL | | 5530 | 6 | 6 | NULL | | 5535 | 6 | 6 | NULL | | 5536 | 6 | 6 | NULL | | 5538 | 7 | 1 | NULL | | 5539 | 3 | 6 | NULL | | 5539 | 4 | 1 | NULL | | 5539 | 5 | 1 | NULL | | 5539 | 8 | 6 | NULL | | 5539 | 7 | 1 | NULL | | 5539 | 9 | 6 | NULL | | 5539 | 2 | 6 | NULL | | 5539 | 1 | 1 | NULL | | 5539 | 1 | 1 | NULL | | 5539 | 2 | 6 | NULL | | 5539 | 3 | 6 | NULL | | 5539 | 4 | 1 | NULL | | 5539 | 5 | 1 | NULL | | 5539 | 8 | 6 | NULL | | 5539 | 9 | 6 | NULL | | 5539 | 7 | 1 | NULL | | 5540 | 18 | 6 | 6 | | 5540 | 17 | 6 | 6 | | 5540 | 16 | 1 | 1 | | 5540 | 15 | 6 | 6 | | 5540 | 14 | 1 | 1 | | 5540 | 13 | 1 | 1 | | 5540 | 11 | 6 | 6 | | 5540 | 10 | 1 | 1 | | 5540 | 12 | 6 | 6 | | 5542 | 12 | 6 | 6 | | 5542 | 11 | 6 | 6 | | 5542 | 10 | 1 | 1 | | 5543 | 10 | 1 | 1 | | 5543 | 11 | 6 | 6 | | 5549 | 11 | 6 | 6 | | 5549 | 10 | 1 | 1 | | 5554 | 23 | 7 | 7 | | 5554 | 22 | 1 | 1 | | 5554 | 21 | 7 | 7 | | 5554 | 20 | 7 | 7 | | 5554 | 12 | 6 | 6 | | 5554 | 11 | 6 | 6 | | 5554 | 10 | 1 | 1 | | 5554 | 13 | 1 | 1 | | 5564 | 21 | 7 | 7 | +------+------+------+------+ 62 rows in set (0.00 sec) mysql> mysql> select -> product_sk psk, -> production_date_sk pdsk, -> factory_sk fsk, -> factory_zip_code_sk fzsk, -> production_quantity qty -> from -> production_fact; +------+------+------+------+------+ | psk | pdsk | fsk | fzsk | qty | +------+------+------+------+------+ | 1 | 5556 | 4 | 6 | 100 | | 2 | 5556 | 3 | 1 | 200 | | 4 | 5556 | 2 | 6 | 300 | | 5 | 5556 | 1 | 1 | 400 | | 1 | 5556 | 1 | 1 | 400 | | 
2 | 5556 | 2 | 6 | 300 | | 4 | 5556 | 3 | 1 | 200 | | 5 | 5556 | 4 | 6 | 100 | +------+------+------+------+------+ 8 rows in set (0.00 sec) 注意 早先的销售订单没有送货邮编(NULL),因为在(五)进阶技术1. “增加列”中才添加的送货邮编信息。 修改定期装载脚本 定期装载有三个地方的修改:
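清单(五)-14-2里的脚本实现这三个修改:(1)装载customer_dim时不再包含邮编、城市和州列,因为这些列已从客户维度中删除;(2)装载sales_order_fact时,用customer_stg里的客户邮编和送货邮编分别关联customer_zip_code_dim和shipping_zip_code_dim视图,取得两个邮编代理键;(3)重建pa_customer_dim时,通过customer_zip_code_dim视图的customer_state列筛选PA州的客户。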
USE dw;

-- Yesterday's date: used as the SCD expiry date of superseded rows and the
-- effective date of new rows.
SET @pre_date = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY);

-- Upper bound of the CDC window for this run (intentionally no WHERE clause).
UPDATE cdc_time
SET current_load = CURRENT_DATE;
-- Customer dimension load, step 1: refresh the staging table with a full copy
-- of the source customers (zip code, city and state are still staged here).
TRUNCATE TABLE customer_stg;

INSERT INTO customer_stg
SELECT
    src.customer_number,
    src.customer_name,
    src.customer_street_address,
    src.customer_zip_code,
    src.customer_city,
    src.customer_state,
    src.shipping_address,
    src.shipping_zip_code,
    src.shipping_city,
    src.shipping_state
FROM source.customer AS src;
/* SCD2 on the address columns */
/* Expire the current version (expiry_date = '2200-01-01') of every customer
   whose street or shipping address differs from staging, or whose shipping
   address is still NULL. */
UPDATE customer_dim AS dim
    INNER JOIN customer_stg AS stg
        ON stg.customer_number = dim.customer_number
SET dim.expiry_date = @pre_date
WHERE dim.expiry_date = '2200-01-01'
  AND (   dim.customer_street_address <> stg.customer_street_address
       OR dim.shipping_address <> stg.shipping_address
       OR dim.shipping_address IS NULL);
/* Add the new version row */
/* The predecessor is the version whose expiry_date equals @pre_date; the new
   row takes its version number + 1 and the staged addresses. The NOT EXISTS
   guard skips customers that already have a current row. */
INSERT INTO customer_dim
SELECT
    NULL,                          -- surrogate key
    stg.customer_number,
    stg.customer_name,
    stg.customer_street_address,
    stg.shipping_address,
    prev.version + 1,
    @pre_date,
    '2200-01-01'
FROM customer_dim AS prev
    INNER JOIN customer_stg AS stg
        ON stg.customer_number = prev.customer_number
WHERE prev.expiry_date = @pre_date
  AND (   prev.customer_street_address <> stg.customer_street_address
       OR prev.shipping_address <> stg.shipping_address
       OR prev.shipping_address IS NULL)
  AND NOT EXISTS (
        SELECT *
        FROM customer_dim AS cur
        WHERE cur.customer_number = stg.customer_number
          AND cur.expiry_date = '2200-01-01');
/* SCD1 on customer_name: overwrite the name on every version of the customer
   when it differs from staging. */
UPDATE customer_dim AS dim
    INNER JOIN customer_stg AS stg
        ON stg.customer_number = dim.customer_number
SET dim.customer_name = stg.customer_name
WHERE dim.customer_name <> stg.customer_name;
/* New customers: staged customer numbers that have no row at all in
   customer_dim are inserted as version 1, effective from @pre_date. */
INSERT INTO customer_dim
SELECT
    NULL,                          -- surrogate key
    stg.customer_number,
    stg.customer_name,
    stg.customer_street_address,
    stg.shipping_address,
    1,
    @pre_date,
    '2200-01-01'
FROM customer_stg AS stg
WHERE stg.customer_number NOT IN (
        SELECT known.customer_number
        FROM customer_stg AS known
            INNER JOIN customer_dim AS dim
                ON dim.customer_number = known.customer_number);
/* Product dimension load, step 1: refresh the staging table with a full copy
   of the source products. */
TRUNCATE TABLE product_stg;

INSERT INTO product_stg
SELECT
    src.product_code,
    src.product_name,
    src.product_category
FROM source.product AS src;
/* SCD2 on product_name and product_category */
/* Expire the current version of every product whose name or category differs
   from staging. */
UPDATE product_dim AS dim
    INNER JOIN product_stg AS stg
        ON stg.product_code = dim.product_code
SET dim.expiry_date = @pre_date
WHERE dim.expiry_date = '2200-01-01'
  AND (   dim.product_name <> stg.product_name
       OR dim.product_category <> stg.product_category);
/* Add the new version row */
/* The predecessor is the version whose expiry_date equals @pre_date; the new
   row takes its version number + 1 and the staged name/category. The
   NOT EXISTS guard skips products that already have a current row. */
INSERT INTO product_dim
SELECT
    NULL,                          -- surrogate key
    stg.product_code,
    stg.product_name,
    stg.product_category,
    prev.version + 1,
    @pre_date,
    '2200-01-01'
FROM product_dim AS prev
    INNER JOIN product_stg AS stg
        ON stg.product_code = prev.product_code
WHERE prev.expiry_date = @pre_date
  AND (   prev.product_name <> stg.product_name
       OR prev.product_category <> stg.product_category)
  AND NOT EXISTS (
        SELECT *
        FROM product_dim AS cur
        WHERE cur.product_code = stg.product_code
          AND cur.expiry_date = '2200-01-01');
/* New products: staged product codes that have no row at all in product_dim
   are inserted as version 1, effective from @pre_date. */
INSERT INTO product_dim
SELECT
    NULL,                          -- surrogate key
    stg.product_code,
    stg.product_name,
    stg.product_category,
    1,
    @pre_date,
    '2200-01-01'
FROM product_stg AS stg
WHERE stg.product_code NOT IN (
        SELECT known.product_code
        FROM product_stg AS known
            INNER JOIN product_dim AS dim
                ON dim.product_code = known.product_code);
/* PRODUCT_COUNT_FACT POPULATION */
/* Rebuilds product_count_fact with one row per product_code: the product's
   surrogate key and the date key of its launch date.
   Selecting non-aggregated columns under GROUP BY product_code is rejected
   when ONLY_FULL_GROUP_BY is enabled and otherwise returns an arbitrary
   version, so the row is chosen explicitly here.
   NOTE(review): the launch date is taken to be the product's earliest
   effective_date — confirm. MIN(product_sk) only breaks ties if two versions
   share that date. */
TRUNCATE TABLE product_count_fact;

INSERT INTO product_count_fact (product_sk, product_launch_date_sk)
SELECT
    MIN(p.product_sk),
    d.date_sk
FROM product_dim AS p
    INNER JOIN (
        -- earliest effective date per product
        SELECT
            product_code,
            MIN(effective_date) AS launch_date
        FROM product_dim
        GROUP BY product_code
    ) AS first_version
        ON first_version.product_code = p.product_code
       AND first_version.launch_date = p.effective_date
    INNER JOIN date_dim AS d
        ON d.date = p.effective_date
GROUP BY
    p.product_code,
    d.date_sk;
/* END OF PRODUCT_COUNT_FACT POPULATION */
-- Load the fact table with the new orders (order_status = 'N') whose
-- entry_date falls inside the CDC window [last_load, current_load).
--
-- The target columns are listed explicitly so the load does not depend on the
-- physical column order of sales_order_fact.
--
-- Dimension lookups:
--   * customer_dim / product_dim: the version valid on the order's status_date
--     (effective_date <= status_date < expiry_date).
--   * zip code keys: customer_stg supplies the customer's zip codes, which are
--     looked up in the customer/shipping role views with the same half-open
--     date range.
--   * All joins are inner joins, so an order whose zip code is NULL or missing
--     from zip_code_dim is not loaded.
-- NOTE(review): sales_order_attribute_dim still uses a closed range
-- (status_date <= expiry_date); confirm whether it should be half-open too.
INSERT INTO sales_order_fact (
    customer_sk,
    customer_zip_code_sk,
    shipping_zip_code_sk,
    product_sk,
    sales_order_attribute_sk,
    order_date_sk,
    allocate_date_sk,
    packing_date_sk,
    ship_date_sk,
    receive_date_sk,
    entry_date_sk,
    order_number,
    request_delivery_date_sk,
    order_amount,
    order_quantity,
    allocate_quantity,
    packing_quantity,
    ship_quantity,
    receive_quantity
)
SELECT
    c.customer_sk,
    i.customer_zip_code_sk,
    j.shipping_zip_code_sk,
    d.product_sk,
    g.sales_order_attribute_sk,
    e.order_date_sk,
    NULL,                          -- allocate_date_sk: set by the later status updates
    NULL,                          -- packing_date_sk
    NULL,                          -- ship_date_sk
    NULL,                          -- receive_date_sk
    h.entry_date_sk,
    a.order_number,
    f.request_delivery_date_sk,
    a.order_amount,
    a.quantity,
    NULL,                          -- allocate_quantity
    NULL,                          -- packing_quantity
    NULL,                          -- ship_quantity
    NULL                           -- receive_quantity
FROM source.sales_order AS a
    INNER JOIN cdc_time AS l
        ON a.entry_date >= l.last_load
       AND a.entry_date <  l.current_load
    INNER JOIN customer_dim AS c
        ON c.customer_number = a.customer_number
       AND a.status_date >= c.effective_date
       AND a.status_date <  c.expiry_date
    INNER JOIN customer_stg AS k
        ON k.customer_number = a.customer_number
    INNER JOIN customer_zip_code_dim AS i
        ON i.customer_zip_code = k.customer_zip_code
       AND a.status_date >= i.effective_date
       AND a.status_date <  i.expiry_date
    INNER JOIN shipping_zip_code_dim AS j
        ON j.shipping_zip_code = k.shipping_zip_code
       AND a.status_date >= j.effective_date
       AND a.status_date <  j.expiry_date
    INNER JOIN product_dim AS d
        ON d.product_code = a.product_code
       AND a.status_date >= d.effective_date
       AND a.status_date <  d.expiry_date
    INNER JOIN order_date_dim AS e
        ON e.order_date = a.status_date
    INNER JOIN entry_date_dim AS h
        ON h.entry_date = a.entry_date
    INNER JOIN request_delivery_date_dim AS f
        ON f.request_delivery_date = a.request_delivery_date
    INNER JOIN sales_order_attribute_dim AS g
        ON g.verification_ind  = a.verification_ind
       AND g.credit_check_flag = a.credit_check_flag
       AND g.new_customer_ind  = a.new_customer_ind
       AND g.web_order_flag    = a.web_order_flag
       AND a.status_date >= g.effective_date
       AND a.status_date <= g.expiry_date
WHERE a.order_status = 'N';
/* RE-BUILD PA CUSTOMER DIMENSION */
/* Full rebuild: keep every customer_dim row that appears on at least one
   sales order whose customer zip code is in state 'PA'. DISTINCT collapses
   customers that have several such orders. */
TRUNCATE TABLE pa_customer_dim;

INSERT INTO pa_customer_dim
SELECT DISTINCT cust.*
FROM customer_dim AS cust
    INNER JOIN sales_order_fact AS f
        ON f.customer_sk = cust.customer_sk
    INNER JOIN customer_zip_code_dim AS zip
        ON zip.customer_zip_code_sk = f.customer_zip_code_sk
WHERE zip.customer_state = 'PA';
/* Move new orders to Allocated status */
/* For each source row with order_status 'A' entered inside the CDC window,
   stamp the allocate date key and quantity on the matching fact row. */
UPDATE sales_order_fact AS f
    INNER JOIN source.sales_order AS so
        ON so.order_number = f.order_number
    INNER JOIN allocate_date_dim AS ad
        ON ad.allocate_date = so.status_date
    INNER JOIN cdc_time AS cdc
        ON so.entry_date >= cdc.last_load
       AND so.entry_date <  cdc.current_load
SET f.allocate_date_sk  = ad.allocate_date_sk,
    f.allocate_quantity = so.quantity
WHERE so.order_status = 'A';
/* Move allocated orders to Packed status */
/* For each source row with order_status 'P' entered inside the CDC window,
   stamp the packing date key and quantity on the matching fact row. */
UPDATE sales_order_fact AS f
    INNER JOIN source.sales_order AS so
        ON so.order_number = f.order_number
    INNER JOIN packing_date_dim AS pd
        ON pd.packing_date = so.status_date
    INNER JOIN cdc_time AS cdc
        ON so.entry_date >= cdc.last_load
       AND so.entry_date <  cdc.current_load
SET f.packing_date_sk  = pd.packing_date_sk,
    f.packing_quantity = so.quantity
WHERE so.order_status = 'P';
/* Move packed orders to Shipped status */
/* For each source row with order_status 'S' entered inside the CDC window,
   stamp the ship date key and quantity on the matching fact row. */
UPDATE sales_order_fact AS f
    INNER JOIN source.sales_order AS so
        ON so.order_number = f.order_number
    INNER JOIN ship_date_dim AS sd
        ON sd.ship_date = so.status_date
    INNER JOIN cdc_time AS cdc
        ON so.entry_date >= cdc.last_load
       AND so.entry_date <  cdc.current_load
SET f.ship_date_sk  = sd.ship_date_sk,
    f.ship_quantity = so.quantity
WHERE so.order_status = 'S';
/* Move shipped orders to Received status */
/* For each source row with order_status 'R' entered inside the CDC window,
   stamp the receive date key and quantity on the matching fact row. */
UPDATE sales_order_fact AS f
    INNER JOIN source.sales_order AS so
        ON so.order_number = f.order_number
    INNER JOIN receive_date_dim AS rd
        ON rd.receive_date = so.status_date
    INNER JOIN cdc_time AS cdc
        ON so.entry_date >= cdc.last_load
       AND so.entry_date <  cdc.current_load
SET f.receive_date_sk  = rd.receive_date_sk,
    f.receive_quantity = so.quantity
WHERE so.order_status = 'R';
-- Roll the CDC window forward: this run's upper bound becomes the next run's
-- lower bound (intentionally no WHERE clause).
UPDATE cdc_time
SET last_load = current_load;

COMMIT;
清单(五)-14-2
图(五)- 14-2到图(五)- 14-16是Kettle定期装载作业的相应修改。
图(五)- 14-2
图(五)- 14-3
图(五)- 14-4
图(五)- 14-5
图(五)- 14-6
图(五)- 14-7
图(五)- 14-8
图(五)- 14-9
图(五)- 14-10
图(五)- 14-11
图(五)- 14-12
图(五)- 14-13
图(五)- 14-14
图(五)- 14-15
图(五)- 14-16
测试修改后的定期装载
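执行修改后的定期装载脚本或相应的Kettle作业前,需要做一些准备工作。首先对源数据的客户信息做以下两处修改:(1)把客户编号4的客户地址和送货地址都改为'9999 Louise Dr.',邮编改为17055;(2)新增一个客户编号为15的客户'Super Stores',其客户邮编和送货邮编都是17055。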
使用下面的语句进行修改: update source.customer set customer_street_address = '9999 Louise Dr.', customer_zip_code = 17055, customer_city = 'Pittsburgh', shipping_address = '9999 Louise Dr.', shipping_zip_code = 17055, shipping_city = 'Pittsburgh' where customer_number = 4; insert into source.customer values(15, 'Super Stores', '1000 Woodland St.', 17055, 'Pittsburgh', 'PA', '1000 Woodland St.', 17055, 'Pittsburgh', 'PA'); COMMIT; 现在在装载新的客户数据前查询最后的客户和送货邮编。后面可以用改变后的信息和此查询的输出作对比。查询语句和结果如下所示。 mysql> SELECT -> order_date_sk odsk, -> customer_number cn, -> customer_zip_code czc, -> shipping_zip_code szc -> FROM -> customer_zip_code_dim a, -> shipping_zip_code_dim b, -> sales_order_fact c, -> customer_dim d -> WHERE -> a.customer_zip_code_sk = c.customer_zip_code_sk -> AND b.shipping_zip_code_sk = c.shipping_zip_code_sk -> AND d.customer_sk = c.customer_sk -> GROUP BY customer_number; +------+------+-------+-------+ | odsk | cn | czc | szc | +------+------+-------+-------+ | 5540 | 1 | 17050 | 17050 | | 5540 | 2 | 17055 | 17055 | | 5540 | 3 | 17055 | 17055 | | 5540 | 4 | 17050 | 17050 | | 5540 | 5 | 17050 | 17050 | | 5540 | 6 | 17055 | 17055 | | 5540 | 7 | 17050 | 17050 | | 5540 | 8 | 17055 | 17055 | | 5540 | 9 | 17055 | 17055 | | 5554 | 11 | 44102 | 44102 | | 5554 | 12 | 44102 | 44102 | | 5554 | 13 | 17050 | 17050 | | 5554 | 14 | 44102 | 44102 | +------+------+-------+-------+ 13 rows in set (0.01 sec) 然后使用下面的语句新增两条销售订单。 INSERT INTO source.sales_order VALUES (65, 4, 3, 'Y', 'Y', 'Y', 'N', '2015-03-27', 'N', '2015-03-31', '2015-03-27', 10000, 100) , (66, 15, 4, 'Y', 'N', 'Y', 'N', '2015-03-27', 'N', '2015-03-31', '2015-03-27', 20000, 200); commit; 现在已经做好了测试修改后定期装载的准备。需要设置你的系统日期为2015年3月28日并执行清单(五)-14-2里的脚本或相应的Kettle作业。 查询customer_dim表,确认两个改变的客户,即编号4和15的客户,已经正确装载。查询语句和结果如下所示。 mysql> select * from customer_dim where customer_number in (4, 15)\G *************************** 1. 
row *************************** customer_sk: 4 customer_number: 4 customer_name: Good Companies customer_street_address: 9500 Scott St. shipping_address: NULL version: 1 effective_date: 2013-03-01 expiry_date: 2015-03-02 *************************** 2. row *************************** customer_sk: 13 customer_number: 4 customer_name: Good Companies customer_street_address: 9500 Scott St. shipping_address: 9500 Scott St. version: 2 effective_date: 2015-03-02 expiry_date: 2015-03-27 *************************** 3. row *************************** customer_sk: 24 customer_number: 4 customer_name: Good Companies customer_street_address: 9999 Louise Dr. shipping_address: 9999 Louise Dr. version: 3 effective_date: 2015-03-27 expiry_date: 2200-01-01 *************************** 4. row *************************** customer_sk: 25 customer_number: 15 customer_name: Super Stores customer_street_address: 1000 Woodland St. shipping_address: 1000 Woodland St. version: 1 effective_date: 2015-03-27 expiry_date: 2200-01-01 4 rows in set (0.00 sec) 查询sales_order_fact表里的两条新销售订单,确认邮编已经正确装载。查询语句和结果如下所示。 mysql> select * from sales_order_fact where order_number IN (65, 66)\G *************************** 1. row *************************** customer_sk: 24 customer_zip_code_sk: 6 shipping_zip_code_sk: 6 product_sk: 4 sales_order_attribute_sk: 3 order_date_sk: 5565 allocate_date_sk: NULL packing_date_sk: NULL ship_date_sk: NULL receive_date_sk: NULL entry_date_sk: 5565 order_number: 65 request_delivery_date_sk: 5569 order_amount: 10000.00 order_quantity: 100 allocate_quantity: NULL packing_quantity: NULL ship_quantity: NULL receive_quantity: NULL *************************** 2. 
row *************************** customer_sk: 25 customer_zip_code_sk: 6 shipping_zip_code_sk: 6 product_sk: 5 sales_order_attribute_sk: 5 order_date_sk: 5565 allocate_date_sk: NULL packing_date_sk: NULL ship_date_sk: NULL receive_date_sk: NULL entry_date_sk: 5565 order_number: 66 request_delivery_date_sk: 5569 order_amount: 20000.00 order_quantity: 200 allocate_quantity: NULL packing_quantity: NULL ship_quantity: NULL receive_quantity: NULL 2 rows in set (0.00 sec) 注意 输出确认了sales_order_fact表的正确导入。zip_code_sk为6是Mechanicsburg,是客户和送货的正确邮编。
查询pa_customer_dim表,确认PA客户正确装载。查询语句和结果如下所示。
mysql> select * from pa_customer_dim; +-------------+-----------------+------------------------+-------------------------+---------------------+---------+----------------+-------------+ | customer_sk | customer_number | customer_name | customer_street_address | shipping_address | version | effective_date | expiry_date | +-------------+-----------------+------------------------+-------------------------+---------------------+---------+----------------+-------------+ | 1 | 1 | Really Large Customers | 7500 Louise Dr. | NULL | 1 | 2013-03-01 | 2015-03-02 | | 2 | 2 | Small Stores | 2500 Woodland St. | NULL | 1 | 2013-03-01 | 2015-03-02 | | 3 | 3 | Medium Retailers | 1111 Ritter Rd. | NULL | 1 | 2013-03-01 | 2015-03-02 | | 4 | 4 | Good Companies | 9500 Scott St. | NULL | 1 | 2013-03-01 | 2015-03-02 | | 5 | 5 | Wonderful Shops | 3333 Rossmoyne Rd. | NULL | 1 | 2013-03-01 | 2015-03-02 | | 6 | 6 | Loyal Clients | 7070 Ritter Rd. | NULL | 1 | 2013-03-01 | 2015-03-01 | | 7 | 7 | Distinguished Agencies | 9999 Scott St. | NULL | 1 | 2013-03-01 | 2015-03-02 | | 8 | 6 | Loyal Clients | 7777 Ritter Rd. | NULL | 2 | 2015-03-01 | 2015-03-02 | | 9 | 8 | Subsidiaries | 10000 Wetline Blvd. | NULL | 1 | 2015-03-01 | 2015-03-02 | | 10 | 1 | Really Large Customers | 7500 Louise Dr. | 7500 Louise Dr. | 2 | 2015-03-02 | 2200-01-01 | | 11 | 2 | Small Stores | 2500 Woodland St. | 2500 Woodland St. | 2 | 2015-03-02 | 2200-01-01 | | 12 | 3 | Medium Retailers | 1111 Ritter Rd. | 1111 Ritter Rd. | 2 | 2015-03-02 | 2200-01-01 | | 13 | 4 | Good Companies | 9500 Scott St. | 9500 Scott St. | 2 | 2015-03-02 | 2015-03-27 | | 14 | 5 | Wonderful Shops | 3333 Rossmoyne Rd. | 3333 Rossmoyne Rd. | 2 | 2015-03-02 | 2200-01-01 | | 15 | 6 | Loyal Clients | 7777 Ritter Rd. | 7777 Ritter Rd. | 3 | 2015-03-02 | 2200-01-01 | | 16 | 7 | Distinguished Agencies | 9999 Scott St. | 9999 Scott St. | 2 | 2015-03-02 | 2200-01-01 | | 17 | 8 | Subsidiaries | 10000 Wetline Blvd. | 10000 Wetline Blvd. 
| 2 | 2015-03-02 | 2200-01-01 | | 18 | 9 | Online Distributors | 2323 Louise Dr. | 2323 Louise Dr. | 1 | 2015-03-02 | 2200-01-01 | | 22 | 13 | PA Customer | 1111 Louise Dr. | 1111 Louise Dr. | 1 | 2015-03-03 | 2200-01-01 | | 24 | 4 | Good Companies | 9999 Louise Dr. | 9999 Louise Dr. | 3 | 2015-03-27 | 2200-01-01 | | 25 | 15 | Super Stores | 1000 Woodland St. | 1000 Woodland St. | 1 | 2015-03-27 | 2200-01-01 | +-------------+-----------------+------------------------+-------------------------+---------------------+---------+----------------+-------------+ 21 rows in set (0.00 sec)
修改产品定期装载 类似于对定期数据仓库装载的修改,需要删除工厂维度导入里所有与邮编相关的列,并在产品事实表导入时使用工厂邮编代理键。 注意不再清空factory_stg表,因为只从factory.csv文件得到了增量的工厂源数据。你必须在过渡表里维护完整的现有工厂数据(包括邮编、城市、州)。因此,使用LOAD DATA INFILE命令的REPLACE选项。修改后的脚本如清单(五)-14-3所示。
USE dw;

-- Yesterday's date: effective date of new dimension rows and the production
-- date loaded by this run.
SET @pre_date = SUBDATE(CURRENT_DATE, 1);

-- Merge the incremental factory file into factory_stg. The staging table is
-- deliberately NOT truncated: REPLACE overwrites a staged row that has the
-- same key and adds rows for new keys, so factory_stg keeps the complete
-- factory data. This relies on factory_stg having a primary or unique key.
-- The enclosing character is a double quote; an empty string here would leave
-- quoted CSV fields with their quotes.
LOAD DATA INFILE '/root/data-integration/factory.csv'
REPLACE INTO TABLE factory_stg
FIELDS TERMINATED BY ','
       OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES                     -- skip the header row
(
    factory_code,
    factory_name,
    factory_street_address,
    factory_zip_code,
    factory_city,
    factory_state
);
/* SCD1: overwrite the name and street address of every factory that is
   present in staging. */
UPDATE factory_dim AS dim
    INNER JOIN factory_stg AS stg
        ON stg.factory_code = dim.factory_code
SET dim.factory_name           = stg.factory_name,
    dim.factory_street_address = stg.factory_street_address;
/* Add new factories: staged factory codes that have no row in factory_dim are
   inserted as version 1, effective from @pre_date. */
INSERT INTO factory_dim
SELECT
    NULL,                          -- surrogate key
    stg.factory_code,
    stg.factory_name,
    stg.factory_street_address,
    1,
    @pre_date,
    '2200-01-01'
FROM factory_stg AS stg
WHERE stg.factory_code NOT IN (
        SELECT known.factory_code
        FROM factory_stg AS known
            INNER JOIN factory_dim AS dim
                ON dim.factory_code = known.factory_code);
-- Load yesterday's production (production_date = @pre_date) into production_fact.
--
-- product_dim is matched with a half-open range (effective_date <=
-- production_date < expiry_date). When a product version changes, the old
-- version's expiry_date and the new version's effective_date are both
-- @pre_date, so a closed upper bound would match both versions and duplicate
-- the fact row. The factory zip code key is looked up via factory_stg, which
-- holds each factory's zip code.
-- The target columns are listed explicitly so the load does not depend on the
-- physical column order of production_fact.
INSERT INTO production_fact (
    product_sk,
    production_date_sk,
    factory_sk,
    factory_zip_code_sk,
    production_quantity
)
SELECT
    b.product_sk,
    c.date_sk,
    d.factory_sk,
    e.factory_zip_code_sk,
    a.production_quantity
FROM source.daily_production AS a
    INNER JOIN product_dim AS b
        ON b.product_code = a.product_code
       AND a.production_date >= b.effective_date
       AND a.production_date <  b.expiry_date
    INNER JOIN date_dim AS c
        ON c.date = a.production_date
    INNER JOIN factory_dim AS d
        ON d.factory_code = a.factory_code
    INNER JOIN factory_stg AS f
        ON f.factory_code = a.factory_code
    INNER JOIN factory_zip_code_dim AS e
        ON e.factory_zip_code = f.factory_zip_code
       AND a.production_date >= e.effective_date
       AND a.production_date <  e.expiry_date
WHERE a.production_date = @pre_date;

COMMIT;
清单(五)-14-3
图(五)- 14-17到图(五)- 14-28是对(五)进阶技术10. 多重星型模式中的Kettle产品定期装载作业的相应修改。
图(五)- 14-17
图(五)- 14-18
图(五)- 14-19
图(五)- 14-20
图(五)- 14-21
图(五)- 14-22
图(五)- 14-23
图(五)- 14-24
图(五)- 14-25
图(五)- 14-26
图(五)- 14-27
图(五)- 14-28
测试修改后的产品定期装载 在执行清单(五)-14-3里的脚本或相应的Kettle作业前,需要准备工厂源数据和日常产品源数据。下面的/root/data-integration/factory.csv文件包含新的工厂信息,在执行产品定期装载脚本时需要将其加到工厂维度里。 FACTORY_CODE,NAME,STREET_ADDRESS,ZIP CODE,CITY,STATE 5,Fifth Factory,90909 McNicholds Blvd.,17055,Pittsburgh,PA 然后使用下面的语句向daily_production表里添加三个日常产品记录。 INSERT INTO source.daily_production VALUES (1, '2015-03-27', 3, 400 ) , (3, '2015-03-27', 4, 200 ) , (5, '2015-03-27', 5, 100 ); commit; 现在执行清单(五)-14-3里的产品事实表定期装载脚本或相应的Kettle作业。 查询factory_dim,确认导入是正确的。查询语句和结果如下所示。 mysql> select * from factory_dim \G *************************** 1. row *************************** factory_sk: 1 factory_code: 1 factory_name: First Factory factory_street_address: 11111 Lichtman St. version: 1 effective_date: 2015-03-18 expiry_date: 2200-01-01 *************************** 2. row *************************** factory_sk: 2 factory_code: 2 factory_name: Second Factory factory_street_address: 24242 Bunty La. version: 1 effective_date: 2015-03-18 expiry_date: 2200-01-01 *************************** 3. row *************************** factory_sk: 3 factory_code: 3 factory_name: Third Factory factory_street_address: 37373 Burbank Dr. version: 1 effective_date: 2015-03-18 expiry_date: 2200-01-01 *************************** 4. row *************************** factory_sk: 4 factory_code: 4 factory_name: Fourth Factory factory_street_address: 44444 Jenzen Blvd. version: 1 effective_date: 2015-03-18 expiry_date: 2200-01-01 *************************** 5. row *************************** factory_sk: 5 factory_code: 5 factory_name: Fifth Factory factory_street_address: 90909 McNicholds Blvd. 
version: 1 effective_date: 2015-03-27 expiry_date: 2200-01-01 5 rows in set (0.00 sec) 注意 第五个工厂被正确添加。 查询production_fact表确认三个新的日常产品被正确装载。查询语句和结果如下所示。 mysql> select -> product_sk psk, -> production_date_sk pdsk, -> factory_sk fsk, -> factory_zip_code_sk fzsk, -> production_quantity qty -> from -> production_fact; +------+------+------+------+------+ | psk | pdsk | fsk | fzsk | qty | +------+------+------+------+------+ | 1 | 5556 | 4 | 6 | 100 | | 2 | 5556 | 3 | 1 | 200 | | 4 | 5556 | 2 | 6 | 300 | | 5 | 5556 | 1 | 1 | 400 | | 1 | 5556 | 1 | 1 | 400 | | 2 | 5556 | 2 | 6 | 300 | | 4 | 5556 | 3 | 1 | 200 | | 5 | 5556 | 4 | 6 | 100 | | 6 | 5565 | 3 | 1 | 400 | | 4 | 5565 | 4 | 6 | 200 | | 7 | 5565 | 5 | 6 | 100 | +------+------+------+------+------+ 11 rows in set (0.00 sec)