Sunday, February 26, 2012

Eliminate top and bottom percent of records

I have a temp table that is being populated and below is a representation of the data.Basically, for each mile_code and rep combination there will be ‘X’ number of records.(In the example below there are 30 records.)What I need to do is eliminate the top and bottom 10% (This could be any percentage and I want to be able to change that easily if I have to.) of DeltaT for each mile_code and rep combination and then average the remaining values in DeltaT for each mile_code and rep combination.

I’ve tried a bunch of different approaches and none give me what I’m looking for.Any ideas or solutions?

mile_code DeltaT rep close_date

Approach 1 Coleman 6/5/2007 16:52

Approach 1 COLEMAN 6/5/2007 14:32

Approach 1 COLEMAN 6/5/2007 9:26

Approach 21 COLEMAN 5/31/2007 14:07

Approach 14 COLEMAN 5/31/2007 9:53

Approach 7 Coleman 5/30/2007 16:12

Approach 36 COLEMAN 5/29/2007 15:53

Approach 8 COLEMAN 5/29/2007 13:54

Approach 8 COLEMAN 5/29/2007 13:23

Approach 8 COLEMAN 5/29/2007 9:00

… (20 more records)

Interview 1 Coleman 6/5/2007 16:52

Interview 1 COLEMAN 6/5/2007 14:32

Interview 1 COLEMAN 6/5/2007 9:26

Interview 8 COLEMAN 5/29/2007 13:54

Interview 8 COLEMAN 5/29/2007 9:00

Interview 14 COLEMAN 5/25/2007 8:37

Interview 61 COLEMAN 5/24/2007 10:57

Interview 14 COLEMAN 5/23/2007 9:32

Interview 26 COLEMAN 5/18/2007 9:39

Interview 23 Coleman 5/14/2007 13:25

… (20 more records)

Followed by 30 each of the other 3 milecodes for Coleman

Approach 0 DOLAN 6/6/2007 9:27

Approach 1 DOLAN 6/5/2007 13:39

Approach 1 DOLAN 6/5/2007 12:20

Approach 1 DOLAN 6/5/2007 11:37

Approach 92 DOLAN 6/5/2007 11:12

Approach 91 DOLAN 6/5/2007 9:44

Approach 2 Dolan 6/4/2007 11:53

Approach 2 DOLAN 6/4/2007 11:11

Approach 5 Dolan 6/1/2007 16:07

Approach 177 DOLAN 6/1/2007 10:58

… (20 more records)

Interview 96 DOLAN 6/5/2007 16:38

Interview 167 DOLAN 6/5/2007 14:24

Interview 2 Dolan 6/4/2007 14:54

Interview 2 DOLAN 6/4/2007 11:11

Interview 5 Dolan 6/1/2007 16:07

Interview 27 DOLAN 6/1/2007 14:48

Interview 323 DOLAN 6/1/2007 13:49

Interview 15 Dolan 6/1/2007 11:48

Interview 224 DOLAN 5/31/2007 16:40

Interview 126 Dolan 5/30/2007 15:02

… (20 more records)

Here is one way.

Code Snippet

select *

from tb t1

where t1.deltat in (select top 90 percent t2.deltat from tb t2 where t2.mile_code=t1.mile_code order by t2.deltat asc)

and t1.deltat in (select top 90 percent t3.deltat from tb t3 where t3.mile_code=t1.mile_code order by t3.deltat desc)

|||

What should be the criteria to eliminate the top and bottom 10 % of [DeltaT], other than [mile_code] and [rep]?

Should we sort by [mile_code], [rep] and [DeltaT] and then eliminate the top and bottom 10%?

AMB

|||I thought of this on the way to work this morning and realized the way I described what I wanted was incorrect. If the data is sorted by DeltaT for each mile_code and rep combination then in this example (30 records for each group and 10% gone from top to bottom) the first 3 records and last 3 records should be removed. So it doesn't matter what the values of DeltaT are. If there were fractions caused by the number of records chosen and/or the percentage then I'd want it to just round to the nearest whole number.|||

Try:

Code Snippet

createtable #t (

mile_code varchar(25)notnull,

rep varchar(25)notnull,

deltat intnotnull

)

go

declare @.i int, @.j int

declare @.mile_code varchar(25)

declare @.rep varchar(25)

set @.j = 1

while @.j <= 2

begin

set @.rep =

case

when @.j = 1 then'COLEMAN'

when @.j = 2 then'DOLAN '

end

set @.i = 1

while @.i <= 60

begin

set @.mile_code =

case

when @.i between 1 and 30 then'Approach'

when @.i between 31 and 60 then'Interview'

end

insertinto #t values(@.mile_code, @.rep,(abs(checksum(newid()))% 100)+ 1)

set @.i = @.i + 1

end

set @.j = @.j + 1

end

go

declare @.percent int

set @.percent = 10

;with cte

as

(

select

mile_code,

rep,

deltat,

((row_number()over(partitionby mile_code, rep orderby deltat))* 100.00)/(count(*)over(partitionby mile_code, rep))as pct

from

#t

)

select

mile_code,

rep,

avg(deltat)as avg_deltat

from

cte

where

pct between @.percent and (100.00 - @.percent)

groupby

mile_code,

rep

orderby

mile_code,

rep

go

droptable #t

go

AMB

|||

I am not perfectly clear about what you are trying to do, but here is a solution that removes the top 10 percent using a CTE and a couple of aggregates. Note this will only work in 2005...

;with rowCounts as (

select row_number() over (partition by mile_code,rep order by closeDate) as rowNum, --gives a row number per group, ordered by close date
count(*) over (partition by mile_code,rep) as totalRows, --gets total rows by mile_code and rep. This is how I do the work later
mile_code,rep, deltaT, closeDate
from #t) --this is the same as his #t, plus the close date. I will repeat it after this explanation

select mile_code,rep, deltaT, closeDate, rowNum, totalRows
from rowCounts
where rowNum > totalRows * (@.percent / 100.0) --removes the bottom 10 percent
and rowNum < totalRows - (totalRows * (@.percent / 100.0)) -- removes the top

The whole code (thanks to hunchback):

set nocount on
create table #t (
mile_code varchar(25) not null,
rep varchar(25) not null,
deltat int not null,
closeDate datetime
)
go
declare @.i int, @.j int
declare @.mile_code varchar(25)
declare @.rep varchar(25)
declare @.closeDate datetime
set @.j = 1
while @.j <= 2
begin
set @.rep =
case
when @.j = 1 then 'COLEMAN'
when @.j = 2 then 'DOLAN '
end
set @.i = 1

while @.i <= 60
begin
set @.mile_code =
case
when @.i between 1 and 22 then 'Approach'
when @.i between 23 and 60 then 'Interview'
end
insert into #t values(@.mile_code, @.rep, (abs(checksum(newid())) % 100) + 1,
dateadd(day,(abs(checksum(newid())) % 100) + 1,'20070101'))
set @.i = @.i + 1
end
set @.j = @.j + 1
end

go

declare @.percent int
set @.percent = 10
;with rowCounts as (
select row_number() over (partition by mile_code,rep order by closeDate) as rowNum,
count(*) over (partition by mile_code,rep) as totalRows,
mile_code,rep, deltaT, closeDate
from #t)
select mile_code,rep, deltaT, closeDate, rowNum, totalRows
from rowCounts
where rowNum > totalRows * (@.percent / 100.0)
and rowNum < totalRows - (totalRows * (@.percent / 100.0))

drop table #t

go

|||here's another alternative,

again, thanks to hunchback for the test data

create table #t (
mile_code varchar(25) not null,
rep varchar(25) not null,
deltat int not null
)
go

declare @.i int, @.j int
declare @.mile_code varchar(25)
declare @.rep varchar(25)
set @.j = 1

while @.j <= 2
begin
set @.rep =
case
when @.j = 1 then 'COLEMAN'
when @.j = 2 then 'DOLAN '
end

set @.i = 1

while @.i <= 60
begin
set @.mile_code =
case
when @.i between 1 and 30 then 'Approach'
when @.i between 31 and 60 then 'Interview'
end

insert into #t values(@.mile_code, @.rep, (abs(checksum(newid())) % 100) + 1)
set @.i = @.i + 1
end

set @.j = @.j + 1
end
go

SELECT IDENTITY(int,1,1) AS row_number
, *
INTO #a
FROM #t
ORDER BY
mile_code
, rep
, deltat

DECLARE @.percnt int
SET @.percnt = 10

SELECT a.mile_code
, a.rep
, a.deltat
, row_number - minrow AS grp_row_number
, maxrow * (@.percnt/100.0) as countpergrp
FROM #a a INNER JOIN
(
SELECT mile_code
, rep
, MIN(row_number) - 1 AS minrow
, COUNT(*) AS maxrow
FROM #a
GROUP BY
mile_code
, rep
) b on a.mile_code = b.mile_code
and a.rep = b.rep
WHERE row_number - minrow between (maxrow * (@.percnt/100.0)) and (maxrow - (maxrow * (@.percnt/100.0)))

DROP TABLE #t
DROP TABLE #a
GO|||

Thanks everyone. Still have a little more work to do to get the final result before putting it into producton (Have to add some other calculations but will do that by joining in another temp table or something.) but all of your input helped a lot. Here is what I ended up with thanks to your help:

DECLARE @.percent int,

@.recentindex int

SET @.percent = 10

SET @.recentindex = 45

Then crate temp table #miledata to get the last 'x' number of records for each rep in each mile_code that they have completed. Do this based off of the @.recentindex variable.

Then use either of the following to get results. Haven't decided which I'll put into production yet.

OPTION A

/*This creates a common table expression to run our aggregate query against.*/

;WITH cte_AveRecords AS

(

SELECT row_number() OVER(partition BY mile_code, rep ORDER BY deltaT) AS rowNum, --gives a row number per group, ordered by deltaT

COUNT(*) OVER (partition BY mile_code, rep) AS totalRows, --gets total rows by mile_code and rep.

mile_code,

rep,

deltaT,

recentindex

FROM#miledata

)

/*This deterimines the average per mile_code and rep by selecting the rows that fall above the low percent and less than or equal to the upper percent.*/

SELECT c.mile_code,

c.rep,

CAST(AVG(CAST(a.deltaT AS Decimal(5,1))) AS Decimal(5,1)) AS AvgTime

FROM

(

SELECT mile_code, rep, deltaT, recentindex, rowNum, totalRows

FROM cte_AveRecords

WHERE rowNum > totalRows * (@.percent / 100.0) --removes the bottom 'x' percent

AND rowNum < totalRows - (totalRows * (@.percent / 100.0)) -- removes the top

)a

JOIN cte_AveRecords c ON a.rep = c.rep AND a.mile_code = c.mile_code

GROUP BY c.mile_code,

c.rep

ORDER BY c.rep,

c.mile_code

OPTION B

/*This creates a common table expression to run our aggregate query against.*/

;WITH cte_AveRecords AS

(

SELECT

mile_code,

rep,

deltat,

/*This assigns a percentage to each row number in the mile_code and rep group.See explaination A below*/

((row_number() over(partition by mile_code, rep order by deltat)) * 100.00) / (count(*) over(partition by mile_code, rep)) as pct

FROM #miledata

)

/*This deterimines the average per mile_code and rep based on what we populated the CTE with..*/

SELECT mile_code,

rep,

CAST(AVG(CAST(deltat AS Decimal(5,1))) AS Decimal(5,1)) AS avg_deltat

FROM cte_AveRecords

WHERE pct > @.percent and pct < (100.00 - @.percent)

GROUP BY mile_code,

rep

ORDER BY rep,

mile_code

GO

No comments:

Post a Comment