While designing Data Warehouse, it is extremely important to confirm dimension strategy and the way of data analysis with customer. Let’s discuss the most common SCD methodologies (Type 1, 2).
SCD Type 1: Overwrite
Overwrite old record in dimension table, and therefore doesnt track historical data
SCD Type 2: Add New Row
Tracks historical data by adding new record to dimension table, managing “Valid From”, “Valid To” and “Is Active” flag.
Stored Procedure SCD1
--CREATE PROCEDURE
CREATE OR ALTER PROCEDURE [dwh].[LoadCustomer_SCD1] as
Begin
--DECLARE VARIABLES
declare @NewRows int
declare @UpdatedRows int
declare @today datetime = getdate()
--INSERT NEW EMAIL ADDRESSES
INSERT INTO [dwh].[Customer_SCD1] (
[EmailAddress]
,[FirstName]
,[LastName]
,[CompanyName]
,[Phone]
,[InsertDate]
,[UpdateDate])
SELECT s.[EmailAddress]
,s.[FirstName]
,s.[LastName]
,s.[CompanyName]
,s.[Phone]
,@today as [InsertDate]
,null as [UpdateDate]
FROM [SalesLT].[Customer_new] s
--FOLLOWING JOIN STATEMENT ALLOWS TO FILTER MISSING EMAIL ADDRESSES IN TARGET TABLE
LEFT JOIN [dwh].[Customer_SCD1] t
on s.[EmailAddress]=t.[EmailAddress] and t.EmailAddress is null
--GET NUMBER OF NEW RECORDS
select @NewRows =@@ROWCOUNT
--UPDATE DESCRIPTION OF EXISTING EMAIL ADDRESSES
UPDATE t SET
t.[FirstName]=s.[FirstName]
,t.[LastName]=s.[LastName]
,t.[CompanyName]=s.[CompanyName]
,t.[Phone]=s.[Phone]
,t.UpdateDate=@today
FROM [SalesLT].[Customer_new] s
INNER JOIN [dwh].[Customer_SCD1] t
on s.[EmailAddress]=t.[EmailAddress]
--FOLLOWING EXISTS STATEMENT FILTERS RECORDS WHERE DESCRIPTION OF EMAIL ADDRESS IS DIFFERENT BETWEEN SOURCE AND TARGET TABLE
where exists (
select HASHBYTES('md5', Concat(isnull(s.[FirstName],''),isnull(s.[LastName],''),isnull(s.[CompanyName],''),isnull(s.[Phone],'n/a')))
except
select HASHBYTES('md5', Concat(isnull(t.[FirstName],''),isnull(t.[LastName],''),isnull(t.[CompanyName],''),isnull(t.[Phone],'n/a')))
)
--GET NUMBER OF UPDATED RECORDS
select @UpdatedRows =@@ROWCOUNT
if @NewRows is null
set @NewRows = 0
if @UpdatedRows is null
set @UpdatedRows = 0
--DISPLAY AUDIT INFORMATION
print '-------- Inserted rows:' + cast (@NewRows as varchar(10))
print '-------- Updated rows:' + cast (@UpdatedRows as varchar(10))
END
GO
Stored Procedure SCD2
--CREATE PROCEDURE CREATE OR ALTER PROCEDURE [dwh].[LoadCustomer_SCD2] as Begin --DECLARE VARIABLES declare @NewRows int declare @UpdatedRows int declare @today datetime = getdate() --ADD NEW RECORDS IF EMAIL ADDRESS DOESN'T EXISTS IN TARGET TABLE OR DESCRIPTION OF EMAIL ADDRESS WAS CHANGED INSERT INTO [dwh].[Customer_SCD2] ( [EmailAddress] ,[FirstName] ,[LastName] ,[CompanyName] ,[Phone] ,[ValidFrom] ,[ValidTo] ,[Active] ,[hash_diff]) SELECT s.[EmailAddress] ,s.[FirstName] ,s.[LastName] ,s.[CompanyName] ,s.[Phone] ,@today as [ValidFrom] ,null as [ValidTo] ,1 as [Active] ,CONVERT(varchar(32),HASHBYTES('md5', Concat(isnull(s.[FirstName],''),isnull(s.[LastName],''),isnull(s.[CompanyName],''),isnull(s.[Phone],'n/a'))),(2)) AS [hash_diff] FROM [SalesLT].[Customer_new] s --FOLLOWING JOIN STATEMENTS FILTER EITHER NEW EMAIL_ADDRESSES OR DIFFERENT DESCRIPTION BETWEEN SOURCE AND TARGET TABLE LEFT JOIN [dwh].[Customer_SCD2] t on s.[EmailAddress]=t.[EmailAddress] and t.ValidTo is null where exists ( select CONVERT(varchar(32),HASHBYTES('md5', Concat(isnull(s.[FirstName],''),isnull(s.[LastName],''),isnull(s.[CompanyName],''),isnull(s.[Phone],'n/a'))),(2)) except select t.[hash_diff] ) --GET NUMBER OF INSERTED RECORDS (BOTH NEW AND UPDATED) select @NewRows =@@ROWCOUNT --UPDATE VALID_TO AND ACTIVE FLAG OF HISTORICAL RECORD BECAUSE THIS RECORD IS NOT ACTIVE ANYMORE UPDATE t SET t.ValidTo=s.ValidFrom ,t.Active=0 from [dwh].[Customer_SCD2] as t --FOLLOWING JOIN AND WHERE STATEMENTS FILTER PREVIOUS ACTIVE RECORDS inner join [dwh].[Customer_SCD2] s on t.[EmailAddress]=s.[EmailAddress] and t.ValidFrom<s.ValidFrom where t.ValidTo is null --GET NUMBER OF UPDATED RECORDS select @UpdatedRows =@@ROWCOUNT if @NewRows is null set @NewRows = 0 if @UpdatedRows is null set @UpdatedRows = 0 --DISPLAY AUDIT INFORMATION print '-------- Inserted rows:' + cast (@NewRows as varchar(10)) print '-------- Updated rows:' + cast (@UpdatedRows as varchar(10)) END GO