Wide World Importers DW – fact.Sale – Query – Top N Sales for each year

Background

Let us start issuing queries against the Data Warehouse DB, WideWorldImportersDW, that we are in the process of actualizing.

 

Lineage

  1. World Wide Importers – Using On SQL Server 2014
    Published On :- 2017-August-23rd
    Link
  2. WideWorldImportersDW – Fattening Up – fact.Sale
    Published On :- 2017-August-31sth
    Link

 

Sample Query

Scenario

The first query is quite popular as it will be return the “top N customers for each year“.  The column that will cast the deciding vote is profit.

 

Choices

Here are the various query patterns that we will try out:

 

Approach Group By Implication
Windowing Function InvoiceDateKeyYear, Customer Key Grouping by actual column
Correlated Join / Max datepart(year, tblFS.[Invoice Date Key]), Customer Key Function/ Computed Column
Correlated Join / Max InvoiceDateKeyYear, Customer Key Grouping by actual column

Queries

Query – Windowing Function / group by Actual Columns

SQL


set XACT_ABORT on
go

set nocount on
go

set statistics io on;
go

dbcc dropcleanbuffers with no_infomsgs
go

use [WideWorldImportersDW]
go

declare @dateStart1 datetime
declare @dateEnd1 datetime

declare @dateStart2 datetime
declare @dateEnd2 datetime

declare @dateStart3 datetime
declare @dateEnd3 datetime

declare @topN int
declare @divider char(1)

declare @tblYear TABLE
(
	  [id] int not null identity(1,1)
	, [year] int not null

	, primary key
	(
		[year]
	)
)

insert into @tblYear
([year])
select 2012
union
select 2013
union
select 2014
union
select 2015
union
select 2016

set @topN = 2
set @divider = '='

print replicate(@divider, 120)

set @dateStart1 = getdate()

; with cteYearCustomer --[Fact].[SaleLargeRowStore]
(
	  [InvoiceDateKeyYear]
	, [CustomerKey]
	, [profit]
	, [rankID]
)
as
(
	select
			  --datepart(year, tblFS.[Invoice Date Key])
			  tblFS.[InvoiceDateKeyYear]
			, [CustomerKey] = tblFS.[Customer Key]
			, [profit] = tblFS.[profit]
			, [rankID] = rank()
							over
								(
									PARTITION by
										  tblFS.[InvoiceDateKeyYear]
										, tblFS.[Customer Key]

									order by
										tblFS.[profit] desc
								)

	from   [Fact].[SaleLargeRowStore] tblFS 

)

select
		  cteYC.[InvoiceDateKeyYear]
	    , cteYC.[CustomerKey]
		, cteYC.[profit]

from   @tblYear cteY

cross apply
		(
		    select distinct top (@topN) with ties *

			from   cteYearCustomer tblFS

			where  cteY.[year] = tblFS.[InvoiceDateKeyYear]

			and    tblFS.[rankID] = 1

			order by
					tblFS.[profit] desc

		) cteYC

order by
		  cteYC.[InvoiceDateKeyYear]
		, cteYC.[profit] desc
		, cteYC.[CustomerKey]

set @dateEnd1 = getdate()

print replicate(@divider, 120)

set @dateStart2 = getdate()

; with cteYearCustomer -- [Fact].[SaleLargeColumnStoreClustered]
(
	  [InvoiceDateKeyYear]
	, [CustomerKey]
	, [profit]
	, [rankID]
)
as
(
	select
			  --datepart(year, tblFS.[Invoice Date Key])
			  tblFS.[InvoiceDateKeyYear]
			, [CustomerKey] = tblFS.[Customer Key]
			, [profit] = tblFS.[profit]
			, [rankID] = rank()
							over
								(
									PARTITION by
										  tblFS.[InvoiceDateKeyYear]
										, tblFS.[Customer Key]

									order by
										tblFS.[profit] desc
								)

	from  [Fact].[SaleLargeColumnStoreClustered] tblFS

)

select
		  cteYC.[InvoiceDateKeyYear]
	    , cteYC.[CustomerKey]
		, cteYC.[profit]

from   @tblYear cteY

cross apply
		(

		    select distinct top (@topN) with ties *

			from   cteYearCustomer tblFS

			where  cteY.[year] = tblFS.InvoiceDateKeyYear

			and    tblFS.[rankID] = 1

			order by tblFS.[profit]

		) cteYC

order by
		  cteYC.[InvoiceDateKeyYear]
		, cteYC.[profit] desc
		, cteYC.[CustomerKey]

set @dateEnd2 = getdate()

print replicate(@divider, 120)

set @dateStart3 = getdate()

; with cteYearCustomer -- [Fact].[SaleLargeColumnStoreNonClustered]
(
	  [InvoiceDateKeyYear]
	, [CustomerKey]
	, [profit]
	, [rankID]
)
as
(
	select
			  --datepart(year, tblFS.[Invoice Date Key])
			  tblFS.[InvoiceDateKeyYear]
			, [CustomerKey] = tblFS.[Customer Key]
			, [profit] = tblFS.[profit]
			, [rankID] = rank()
							over
								(
									PARTITION by
										  tblFS.[InvoiceDateKeyYear]
										, tblFS.[Customer Key]

									order by
										tblFS.[profit] desc
								)

	from  [Fact].[SaleLargeColumnStoreNonClustered] tblFS

)

select
		  cteYC.[InvoiceDateKeyYear]
	    , cteYC.[CustomerKey]
		, cteYC.[profit]

from   @tblYear cteY

cross apply
		(

		    select distinct top (@topN) with ties *

			from   cteYearCustomer tblFS

			where  cteY.[year] = tblFS.InvoiceDateKeyYear

			and    tblFS.[rankID] = 1

			order by tblFS.[profit]

		) cteYC

order by
		  cteYC.[InvoiceDateKeyYear]
		, cteYC.[profit] desc
		, cteYC.[CustomerKey]

set @dateEnd3 = getdate()

print replicate(@divider, 120)

declare @datediffSum  int

set @datediffSum =
					  datediff(second, @dateStart1, @dateEnd1)
					+ datediff(second, @dateStart2, @dateEnd2)
					+ datediff(second, @dateStart3, @dateEnd3)

select 

		  [sourceID] = [sourceID]
		,  = 
		, [dateStart] = @dateStart1
		, [dateEnd] = @dateEnd1
		, [duration] =  [duration]
		, [%] = cast([%] as decimal(6, 2))

from   (

	select
			  [sourceID] = 1
			,  = '[Fact].[SaleLargeRowStore]-Index Chosen By Engine'
			, [dateStart1] = @dateStart1
			, [dateEnd1] = @dateEnd1
			, [duration] = datediff(second, @dateStart1, @dateEnd1)
			, [%] = (datediff(second, @dateStart1, @dateEnd1)) * 100.00 / @datediffSum

	union

	select
			  [sourceID] = 2
			,  = '[Fact].[SaleLargeColumnStoreClustered]'
			, [dateStart2] = @dateStart2
			, [dateEnd2] = @dateEnd2
			, [duration2] = datediff(second, @dateStart2, @dateEnd2)
			, [%] = (datediff(second, @dateStart2, @dateEnd2)) * 100.00 / @datediffSum

	union

	select
			  [sourceID] = 3
			,  = '[Fact].[SaleLargeColumnStoreNonClustered]'
			, [dateStart3] = @dateStart3
			, [dateEnd3] = @dateEnd3
			, [duration3] = datediff(second, @dateStart3, @dateEnd3)
			, [%] = (datediff(second, @dateStart3, @dateEnd3)) * 100.00 / @datediffSum
	) tblA

Query Plan

Query Plan – Individual Queries Plan
Query Plan – RowStore Clustered Index

Query Plan – RowStore Clustered Index – Clustered Index Scan ( Clustered )

Query Operator – Properties

Explanation

  1. Actual Number of Rows :- 61,140,035 ( 61 million )
  2. Description :- Scanning a clustered index, entirely or only a range.
    • Table Scan
  3. Cost
    • Estimated CPU Cost :- 13.451
    • Estimated IO Cost :- 254.633
    • Estimated Operator Cost :- 268.084
      • We can see most of cost is IO
    • Estimated Number of Executions :-1
      • The controller is a year table variable
      • For Table variable, # of records is assumed to 1
  4. Number of Reads
    • Number :-
      • Estimated Number of Rows :- 12,228,000
      • Actual Number of Rows :- 61,140,000

 

Query Plan – RowStore Clustered Index – Warning – “Operator used tempdb to spill data
Query Plan – ColumnStore Clustered Index

Query Plan – Column Store Clustered Index – Clustered Index Scan ( Clustered )

Query Operator – Properties

Query Operator – Explanation
  1. Actual Number of Rows :- 61140025 ( 61 million )
  2. Description :- Scanning a clustered index, entirely or only a range.
    • Table Scan
  3. Cost
    • Estimated CPU Cost :- 13.451
    • Estimated IO Cost :- 3.63868
    • Estimated Operator Cost :- 17.0896
      • IO Cost is 25% of CPU Cost
    • Estimated Number of Executions :-1
      • The controller is a year table variable
      • For Table variable, # of records is assumed to 1
  4. Number of Reads
    • Number :-
      • Estimated Number of Rows :- 12228000
      • Actual Number of Rows :- 61140025

 

 

Query Plan – ColumnStore Non-Clustered Index

Query Plan – Column Store Clustered Index – NonClustered Index Scan ( Clustered )

Query Operator – Properties

Explanation

  1. Actual Number of Rows :- 61140025 ( 6.1 million )
  2. Description :- Scan rows from a table.
    • Table Scan
  3. Cost
    • Estimated CPU Cost :- 13.451
    • Estimated IO Cost :- 254.747
    • Estimated Operator Cost :- 268.198
      • We can see most of cost is IO
      • IO if 95% of Total
    • Estimated Number of Executions :-1
      • The controller is a year table variable
      • For Table variable, # of records is assumed to 1
  4. Number of Reads
    • Number :-
      • Estimated Number of Rows :- 12228000
      • Actual Number of Rows :- 61140025

 

Query Plan – ReadOut
  1. Operator :- Sort
    • On all of approaches the Sort Operator is the most expensive
    • Row Store Index
      • Operator used tempdb to spill data during execution with spill level 1 and 1 spilled thread(s), Sort wrote 476430 pages to and read 476430 pages from tempdb with granted memory 4845520KB and used memory 4845520 KB
      • High granted and used Memory
        • 4845520 KB
          • 4845.52 MB
          • 4.84552 GB
  2. Operator :- Sequence Project
    • Norm of Windowing Functions

 

Statistics I/O

Image

 

Tabulated

 

Table Type Table / Target IO Stats
Row Store – Clustered Index
SaleLargeRowStore Scan count 5, logical reads 1722395, physical reads 2, read-ahead reads 344480, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Worktable Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 158810, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Workfile Table ‘Workfile’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table Variable – Year Table ‘#A3F66ED9’. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Column Store – Clustered Index
SaleLargeColumnStoreClustered Table ‘SaleLargeColumnStoreClustered’. Scan count 5, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 10349, lob physical reads 0, lob read-ahead reads 0.

Table ‘SaleLargeColumnStoreClustered’. Segment reads 70, segment skipped 0.

Worktable Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table Variable – Year Table ‘#A3F66ED9’. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Column Store – Non Clustered Index
SaleLargeColumnStoreNonClustered Table ‘SaleLargeColumnStoreNonClustered’. Scan count 5, logical reads 1719525, physical reads 0, read-ahead reads 343604, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Worktable Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table Variable – Year Table ‘#A3F66ED9’. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

Explanation

  1. The Row Store Clustered Index & the Heap Column Store Index have very high I/O
  2. On the other hand, the Column Store – Clustered Index, have much lower I/O

 

Timing

Image

Explanation
  1. Duration
    • Percentile
      • The Column Stored Clustered Index is a bit better at 27%
      • While the Row Store Clustered Index & Column Store Heap are at 35 and 37%
    • Actual Time – Individual Queries
      • Row Store – Clustered Index
        • 433 seconds or 7.21 minutes
      • Column Store – Clustered Index
        • 330 seconds or 5.5 minutes
      • Column Stored – Non Clustered Index
        • 408 seconds or 6.8 minutes
    • Actual Time – All Queries
      • 19.09 minutes

 

Correlated Join / Max – group By Function [ datepart(year) ]

SQL


dbcc dropcleanbuffers with no_infomsgs
go

set nocount on
go

set statistics io on;
go

use [WideWorldImportersDW]
go

declare @dateStart1 datetime
declare @dateEnd1 datetime

declare @dateStart2 datetime
declare @dateEnd2 datetime

declare @dateStart3 datetime
declare @dateEnd3 datetime

declare @datediffSum  int

declare @topN int

declare @tblYear TABLE
(
	  [id] int not null identity(1,1)
	, [year] int not null

	, primary key
	(
		[year]
	)
)

insert into @tblYear
([year])
select 2012
union
select 2013
union
select 2014
union
select 2015
union
select 2016

set @topN = 2

print replicate('*', 120)

set @dateStart1 = getdate()

; with cteYearCustomer
(
	  [InvoiceDateKeyYear]
	, [CustomerKey]
	, [profit]
)
as
(
	select
			  datepart(year, tblFS.[Invoice Date Key])
			 -- tblFS.[InvoiceDateKeyYear]
			, [CustomerKey] = tblFS.[Customer Key]
			, [profit] = max(tblFS.[profit])

	from   [Fact].[SaleLargeRowStore] tblFS --with ( INDEX =1)

	group by
			  datepart(year, tblFS.[Invoice Date Key])
			  --tblFS.[InvoiceDateKeyYear]
			, tblFS.[Customer Key]

)

select
		  cteYC.[InvoiceDateKeyYear]
	    , cteYC.[CustomerKey]
		, cteYC.[profit]

from   @tblYear cteY

cross apply
		(
		    select top (@topN) with ties *

			from   cteYearCustomer tblFS

			where  cteY.[year] = tblFS.[InvoiceDateKeyYear]

			order by tblFS.[profit]

		) cteYC

order by
		  cteYC.[InvoiceDateKeyYear]
		, cteYC.[profit] desc
		, cteYC.[CustomerKey]

set @dateEnd1 = getdate()

print replicate('*', 120)

set @dateStart2 = getdate()

; with cteYearCustomer
(
	  [InvoiceDateKeyYear]
	, [CustomerKey]
	, [profit]
)
as
(
	select
			 datepart(year, tblFS.[Invoice Date Key])
			 -- tblFS.[InvoiceDateKeyYear]
			, [CustomerKey] = tblFS.[Customer Key]
			, [profit] = max(tblFS.[profit])

	from  [Fact].[SaleLargeColumnStoreClustered] tblFS

	group by
			  datepart(year, tblFS.[Invoice Date Key])
			  --tblFS.[InvoiceDateKeyYear]
			, tblFS.[Customer Key]

)

select
		  cteYC.[InvoiceDateKeyYear]
	    , cteYC.[CustomerKey]
		, cteYC.[profit]

from   @tblYear cteY

cross apply
		(
		    select top (@topN) with ties *

			from   cteYearCustomer tblFS

			where  cteY.[year] = tblFS.InvoiceDateKeyYear

			order by tblFS.[profit]

		) cteYC

order by
		  cteYC.[InvoiceDateKeyYear]
		, cteYC.[profit] desc
		, cteYC.[CustomerKey]

set @dateEnd2 = getdate()

print replicate('*', 120)

set @dateStart3 = getdate()

; with cteYearCustomer
(
	  [InvoiceDateKeyYear]
	, [CustomerKey]
	, [profit]
)
as
(
	select
			  datepart(year, tblFS.[Invoice Date Key])
			  --tblFS.[InvoiceDateKeyYear]
			, [CustomerKey] = tblFS.[Customer Key]
			, [profit] = max(tblFS.[profit])

	from  [Fact].[SaleLargeColumnStoreNonClustered] tblFS

	group by
			  datepart(year, tblFS.[Invoice Date Key])
			 -- tblFS.[InvoiceDateKeyYear]
			, tblFS.[Customer Key]
)

select
		  cteYC.[InvoiceDateKeyYear]
	    , cteYC.[CustomerKey]
		, cteYC.[profit]

from   @tblYear cteY

cross apply
		(
		    select top (@topN) with ties *

			from   cteYearCustomer tblFS

			where  cteY.[year] = tblFS.InvoiceDateKeyYear

			order by tblFS.[profit]

		) cteYC

order by
		  cteYC.[InvoiceDateKeyYear]
		, cteYC.[profit] desc
		, cteYC.[CustomerKey]

set @dateEnd3 = getdate()

print replicate('*', 120)

set @datediffSum =
			      datediff(second, @dateStart1, @dateEnd1)
				+ datediff(second, @dateStart2, @dateEnd2)
				+ datediff(second, @dateStart3, @dateEnd3)

select 

		  [sourceID] = [sourceID]
		,  = 
		, [dateStart] = dateStart1
		, [dateEnd] = dateEnd1
		, [duration] =  [duration]
		, [%] = cast([%] as decimal(6, 2))

from
	(

		select
				  [sourceID] = 1
				,  = '[Fact].[SaleLargeRowStore]'
				, [dateStart1] = @dateStart1
				, [dateEnd1] = @dateEnd1
				, [duration] = datediff(second, @dateStart1, @dateEnd1)
				, [%] = (datediff(second, @dateStart1, @dateEnd1)) * 100.00 / @datediffSum

		union

		select
				  [sourceID] = 2
				,  = '[Fact].[SaleLargeColumnStoreClustered]'
				, [dateStart2] = @dateStart2
				, [dateEnd2] = @dateEnd2
				, [duration2] = datediff(second, @dateStart2, @dateEnd2)
				, [%] = (datediff(second, @dateStart2, @dateEnd2)) * 100.00 / @datediffSum

		union

		select
				  [sourceID] = 3
				,  = '[Fact].[SaleLargeColumnStoreNonClustered]'
				, [dateStart3] = @dateStart3
				, [dateEnd3] = @dateEnd3
				, [duration3] = datediff(second, @dateStart3, @dateEnd3)
				, [%] = (datediff(second, @dateStart3, @dateEnd3)) * 100.00 / @datediffSum

	) tblA

go

 

Query Plan

Query Plan – Individual Queries Plan
Query Plan – RowStore Clustered Index

Image

Explanation

  1. The Clustered Index Scan is the most expensive operation
  2. The Sort came in at 0

 

Query Plan – Column Store Clustered Index

Image

Explanation

  1. The Column Index Scan is at 49%
  2. The compute scalar is at 3%
    • Processing datepart(year, [InvoiceDate])
  3. The Filter ( for each year is at 16 %)
  4. And, the Hash match is 31%
  5. The Sort is only 0%

 

Operator – Compute Scaler

Image

Defined Values

Explanation

  1. [Expr1003] = Scalar Operator(datepart(year,[WideWorldImportersDW].[Fact].[SaleLargeColumnStoreClustered].[Invoice Date Key] as [tblFS].[Invoice Date Key]))
Operator – Filter

Image – Properties

Image – Properties – Filter By Predicate

Explanation

  1. [Expr1003] = Scalar Operator(datepart(year,[WideWorldImportersDW].[Fact].[SaleLargeColumnStoreClustered].[Invoice Date Key] as [tblFS].[Invoice Date Key]))

 

Query Plan – Column Store Non Clustered Index

Image

Explanation

  1. The Column Index Scan is at 94%
  2. The compute scalar is at 0%
    • Processing datepart(year, [InvoiceDate])
  3. No Filtering
    • It looks like we have just data rows that we need
  4. The Sort is only 0%

Statistics I/O

Image

Tabulated
Table Type Table / Target IO Stats

Row Store – Clustered Index
SaleLargeRowStore Table ‘SaleLargeRowStore’. Scan count 5, logical reads 1722395, physical reads 2, read-ahead reads 323184, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Worktable Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Workfile Table ‘Workfile’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table Variable – Year Table ‘#A7C6FFBD’. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Column Store – Clustered Index
SaleLargeColumnStoreClustered Table ‘SaleLargeColumnStoreClustered’. Scan count 5, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 13464, lob physical reads 0, lob read-ahead reads 0.

Table ‘SaleLargeColumnStoreClustered’. Segment reads 70, segment skipped 0.

Worktable Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table Variable – Year Table ‘#A7C6FFBD’. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Column Store – Non Clustered Index
SaleLargeColumnStoreNonClustered Table ‘SaleLargeColumnStoreNonClustered’. Scan count 5, logical reads 1719525, physical reads 0, read-ahead reads 342488, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Worktable Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table Variable – Year Table ‘#A3F66ED9’. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

Explanation

  1. The Row Store Clustered Index & the Heap Column Store Index have very high I/O
    • They are table scans, one is searching through a clustered index and the other through a heap
  2. On the other hand, the Column Store – Clustered Index, has much lower I/O
    • Likely based on the higher compression that is used for Column Store Indexes

 

Timing

Image

Explanation
  1. Duration
    • Percentile
      • The Column Stored Clustered Index is a bit better at 30%
      • The Row Store Clustered Index is at 37.78%
      • And, the Column Stored Heap is at 32.22%
    • Actual Time – Individual Queries
      • Row Store – Clustered Index
        • 27 seconds
      • Column Store – Clustered Index
        • 29 seconds
      • Column Stored – Non Clustered Index
        • 34 seconds
    • Actual Time – All Queries
      • 1.36 minutes
      • or 96 seconds

 

Correlated Join / Max – group By Actual Column – year

SQL


use [WideWorldImportersDW]
go

dbcc dropcleanbuffers with no_infomsgs
go

set nocount on
go

set statistics io on;
go

declare @dateStart1 datetime
declare @dateEnd1 datetime

declare @dateStart2 datetime
declare @dateEnd2 datetime

declare @dateStart3 datetime
declare @dateEnd3 datetime

declare @topN int

declare @tblYear TABLE
(
	  [id] int not null identity(1,1)
	, [year] int not null

	, primary key
	(
		[year]
	)
)

insert into @tblYear
([year])
select 2012
union
select 2013
union
select 2014
union
select 2015
union
select 2016

set @topN = 2

print replicate('*', 120)

print replicate('*', 120)

set @dateStart1 = getdate()

; with cteYearCustomer
(
	  [InvoiceDateKeyYear]
	, [CustomerKey]
	, [profit]
)
as
(
	select
			  --datepart(year, tblFS.[Invoice Date Key])
			  tblFS.[InvoiceDateKeyYear]
			, [CustomerKey] = tblFS.[Customer Key]
			, [profit] = max(tblFS.[profit])

	from   [Fact].[SaleLargeRowStore] tblFS --with ( INDEX =1)

	group by
			  --datepart(year, tblFS.[Invoice Date Key])
			  tblFS.[InvoiceDateKeyYear]
			, tblFS.[Customer Key]

)

select
		  cteYC.[InvoiceDateKeyYear]
	    , cteYC.[CustomerKey]
		, cteYC.[profit]

from   @tblYear cteY

cross apply
		(
		    select top (@topN) with ties *

			from   cteYearCustomer tblFS

			where  cteY.[year] = tblFS.[InvoiceDateKeyYear]

			order by tblFS.[profit]

		) cteYC

order by
		  cteYC.[InvoiceDateKeyYear]
		, cteYC.[profit] desc
		, cteYC.[CustomerKey]

set @dateEnd1 = getdate()

print replicate('*', 120)

set @dateStart2 = getdate()

; with cteYearCustomer
(
	  [InvoiceDateKeyYear]
	, [CustomerKey]
	, [profit]
)
as
(
	select
			--  datepart(year, tblFS.[Invoice Date Key])
			  tblFS.[InvoiceDateKeyYear]
			, [CustomerKey] = tblFS.[Customer Key]
			, [profit] = max(tblFS.[profit])

	from  [Fact].[SaleLargeColumnStoreClustered] tblFS

	group by
			  --datepart(year, tblFS.[Invoice Date Key])
			  tblFS.[InvoiceDateKeyYear]
			, tblFS.[Customer Key]

)

select-- top 5 with ties
		  cteYC.[InvoiceDateKeyYear]
	    , cteYC.[CustomerKey]
		, cteYC.[profit]

--from   cteYear
from   @tblYear cteY

cross apply
		(
		    select top (@topN) with ties *

			from   cteYearCustomer tblFS

			where  cteY.[year] = tblFS.InvoiceDateKeyYear

			order by tblFS.[profit]

		) cteYC

order by
		  cteYC.[InvoiceDateKeyYear]
		, cteYC.[profit] desc
		, cteYC.[CustomerKey]

set @dateEnd2 = getdate()

print replicate('*', 120)

set @dateStart3 = getdate()

; with cteYearCustomer
(
	  [InvoiceDateKeyYear]
	, [CustomerKey]
	, [profit]
)
as
(
	select
			  --datepart(year, tblFS.[Invoice Date Key])
			  tblFS.[InvoiceDateKeyYear]
			, [CustomerKey] = tblFS.[Customer Key]
			, [profit] = max(tblFS.[profit])

	from  [Fact].[SaleLargeColumnStoreNonClustered] tblFS

	group by
			 -- datepart(year, tblFS.[Invoice Date Key])
			  tblFS.[InvoiceDateKeyYear]
			, tblFS.[Customer Key]
)

select-- top 5 with ties
		  cteYC.[InvoiceDateKeyYear]
	    , cteYC.[CustomerKey]
		, cteYC.[profit]

from   @tblYear cteY

cross apply
		(
		    select top (@topN) with ties *

			from   cteYearCustomer tblFS

			where  cteY.[year] = tblFS.InvoiceDateKeyYear

			order by tblFS.[profit]

		) cteYC

order by
		  cteYC.[InvoiceDateKeyYear]
		, cteYC.[profit] desc
		, cteYC.[CustomerKey]

set @dateEnd3 = getdate()

print replicate('*', 120)

declare @datediffSum  int

set @datediffSum =
					  datediff(second, @dateStart1, @dateEnd1)
					+ datediff(second, @dateStart2, @dateEnd2)
					+ datediff(second, @dateStart3, @dateEnd3)

select 

		  [sourceID] = [sourceID]
		,  = 
		, [dateStart] = dateStart1
		, [dateEnd] = dateEnd1
		, [duration] =  [duration]
		, [%] = cast([%] as decimal(6, 2))

from   (

	select
			  [sourceID] = 1
			,  = '[Fact].[SaleLargeRowStore]'
			, [dateStart1] = @dateStart1
			, [dateEnd1] = @dateEnd1
			, [duration] = datediff(second, @dateStart1, @dateEnd1)
			, [%] = (datediff(second, @dateStart1, @dateEnd1)) * 100.00 / @datediffSum

	union

	select
			  [sourceID] = 2
			,  = '[Fact].[SaleLargeColumnStoreClustered]'
			, [dateStart2] = @dateStart2
			, [dateEnd2] = @dateEnd2
			, [duration2] = datediff(second, @dateStart2, @dateEnd2)
			, [%] = (datediff(second, @dateStart2, @dateEnd2)) * 100.00 / @datediffSum

	union

	select
			  [sourceID] = 3
			,  = '[Fact].[SaleLargeColumnStoreNonClustered]'
			, [dateStart3] = @dateStart3
			, [dateEnd3] = @dateEnd3
			, [duration3] = datediff(second, @dateStart3, @dateEnd3)
			, [%] = (datediff(second, @dateStart3, @dateEnd3)) * 100.00 / @datediffSum
	) tblA

Query Plan

Query Plan – Individual Queries Plan
Query Plan – RowStore Clustered Index
Image

Explanation
  1. The Clustered Index Scan takes up 94%
  2. Sort is 0%
  3. We are told of the missing Index
Image – Operator – Clustered Index Scan
  1. Description :- Scanning a clustered index, entirely or only a range.
    • Table Scan
  2. Cost
    • Estimated CPU Cost :- 13.451
    • Estimated IO Cost :- 254.633
    • Estimated Operator Cost :- 268.084
      • We can see most of cost is IO
  3. Number of Executions
    • Estimated Number of Executions :-1
      • The controller is a year table variable
      • For Table variable, # of records is assumed to 1
    • Actual Number of Executions :- 5
      • Number of records in Table variables ( year )
  4. Number of Reads
    • Estimated Number of Rows :-
      • 12228000 ( 12 million )
    • Number of Rows Reads :-
      • 61140025 ( 61 million )
        • Number of Record in Table :- 12 million
        • Number of batches :- 5
Image – Operator – Hash Matched – Defined Values

Image – Hash Matched – Defined Values

Explanation
  1. Grouping by InvoiceDateKeyYear

 

Image – Missing Index

Explanation
  1. We are told, actually warned, of a missing index
  2. The Index definition is “[Fact].[SaleLargeRowStore] ([InvoiceDateKeyYear]) INCLUDE ([Customer Key],[Profit])
  3. And, so we are asked to look into having an index keyed on our grouping column ( InvoiceDateKeyYear )
  4. And, the other columns that we are referencing ( Customer Key and profit) included

 

Query Plan – RowStore Clustered Index – Clustered Index Scan ( Clustered )

Query Operator – Properties

Explanation

  1. Actual Number of Rows :- 61140025 ( 6.1 million )
  2. Description :- Scanning a clustered index, entirely or only a range.
    • Table Scan
  3. Cost
    • Estimated CPU Cost :- 13.451
    • Estimated IO Cost :- 254.633
    • Estimated Operator Cost :- 268.084
      • We can see most of our cost is IO
    • Estimated Number of Executions :-1
      • The controller is a year table variable
      • For Table variable, # of records is assumed to 1
  4. Number of Reads
    • Number :-
      • Estimated Number of Rows :- 12228000
      • Actual Number of Rows :- 61140025
    • Explanation :-
      • Estimate is just
      • Likely because of predicate push down we ended up reading only half the full number of records

 

Query Plan – RowStore Clustered Index – Warning – “Operator used tempdb to spill data
Query Plan – ColumnStore Clustered Index

Explanation
  1. The Clustered Index Scan takes up 46%
  2. Missing Index
    • We are told of the missing Index
    • Impact of missing index :- 40%
  3. Hash Match Aggregate Operator
    • Hash Match operator use in calculating max
  4. Filter Operator
    • Filter on Year
  5. Sort is 0%

 

Query Plan – Column Store Clustered Index – Clustered Index Scan ( Clustered )

Query Operator – Properties

Query Operator – Explanation
  1. Actual Number of Rows :- 61140025 ( 6.1 million )
  2. Description :- Scanning a clustered index, entirely or only a range.
    • Table Scan
  3. Cost
    • Estimated CPU Cost :- 13.451
    • Estimated IO Cost :- 3.63868
    • Estimated Operator Cost :- 17.0896
      • IO Cost is 25% of CPU Cost
    • Estimated Number of Executions :-1
      • The controller is a year table variable
      • For Table variable, # of records is assumed to 1
  4. Number of Reads
    • Number :-
      • Estimated Number of Rows :- 12228000
      • Actual Number of Rows :- 61140025
    • Explanation :-
      • Estimate is just
      • Likely because of predicate push down we ended up reading only half the full number of records

 

 

Query Plan – ColumnStore Non-Clustered Index

Image

Explanation
  1. The Clustered Index Scan takes up 94%
  2. Hash Match Aggregate Operator
    • takes up 6%
    • Hash Match operator use in calculating max
  3. Sort is 0%
Query Plan – Column Store Clustered Index – NonClustered Index Scan ( Clustered )

Query Operator – Properties – Columnstore Index Scan

Explanation

  1. Actual Execution Mode
    • Row
  2. Description
    • Scan a columnstore index, entirely or only a range.
  3. Cost
    • Estimated CPU Cost :- 13.451
    • Estimated IO Cost :- 3.63868
    • Estimated Operator Cost :- 17.0896
  4. Number of Executions
    • Estimated Number of Executions :-1
      • The controller is a year table variable
      • For Table variable, # of records is assumed to 1
  5. Number of Reads
    • Estimated Number of Rows :- 12228000 ( 12 million )
    • Actual Number of Rows :- 12228000 ( 12 million )
    • Number of Rows Read :- 61140025 ( 61 million )

Statistics I/O

Image

Tabulated
Table Type Table / Target IO Stats

Row Store – Clustered Index
SaleLargeRowStore Table ‘SaleLargeRowStore’. Scan count 5, logical reads 1722395, physical reads 2, read-ahead reads 344480, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Worktable Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Workfile Table ‘Workfile’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table Variable – Year Table ‘#A7C6FFBD’. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Column Store – Clustered Index
SaleLargeColumnStoreClustered Table ‘SaleLargeColumnStoreClustered’. Scan count 5, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 10349, lob physical reads 0, lob read-ahead reads 0.

Table ‘SaleLargeColumnStoreClustered’. Segment reads 70, segment skipped 0.

Worktable Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table Variable – Year Table ‘#A7C6FFBD’. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Column Store – Non Clustered Index
SaleLargeColumnStoreNonClustered Table ‘SaleLargeColumnStoreNonClustered’. Scan count 5, logical reads 1719525, physical reads 0, read-ahead reads 343604, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Worktable Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table Variable – Year Table ‘#A3F66ED9’. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

Explanation

  1. The Row Store Clustered Index & the Heap Column Store Index have very high I/O
    • They are table scans, one is searching through a clustered index and the other through a heap
  2. On the other hand, the Column Store – Clustered Index, has much lower I/O
    • Likely based on the higher compression that is used for Column Store Indexes

 

Timing

Image

Explanation
  1. Duration
    • Percentile
      • The Column Stored Clustered Index is a bit better at 30.77%
      • The Row Store Clustered Index is at 35.38%
      • And, the Column Stored Heap is at 33.85%
    • Actual Time – Individual Queries
      • Row Store – Clustered Index
        • 23 seconds
      • Column Store – Clustered Index
        • 20 seconds
      • Column Stored – Non Clustered Index
        • 22 seconds
    • Actual Time – All Queries
      • 1.06 minutes
      • or 66 seconds

 

Source Code Control

GitHub

WideWorldImportersDWFactSale/Query/QueryMaxByYear.InvoiceDate/
Link

 

Methodology Group By Filename
Windowing Function Group by InvoiceDateKeyYear, CustomerKey queryMaxByYearCorrelatedJoin_GroupByActualColumn.sql
Correlated Join / Max Group by Datepart(year, InvoiceDateKey), CustomerKey queryMaxByYearCorrelatedJoin_GroupByFunctionDatePartYear.sql
Correlated Join / Max Group by InvoiceDateKeyYear, CustomerYear queryMaxByYearWindowsFunction_GroupByActualColumn.sql

 

References

  1. Windowing Functions
    • Dwain Camps
      • The Performance of the T-SQL Window Functions
        Link

 

Summary

Thankfully, there is a lot here.

We only looked at two ways of identifying the Max Record; and those two approaches are Window Functions and Correlated Max.

The Windowing Function seems to be more expensive.

The Column Store Clustered Index is the most performant.

Likely specific Row Store Indexes would have been better; but we did not take the time to create one.

Still have yet to find queries that will make us choose a Non-Clustered Column Index over a Clustered Column Index.

 

One thought on “Wide World Importers DW – fact.Sale – Query – Top N Sales for each year

Leave a Reply

Please log in using one of these methods to post your comment:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s