Percentage of Values for Top 3 from a Character Field - tsql

I have an unusual situation. Please consider the following code:
-- Rebuild the scratch table used by the examples that follow.
IF OBJECT_ID('tempdb..#CharacterTest') IS NOT NULL
    DROP TABLE #CharacterTest;

CREATE TABLE #CharacterTest
(
    [ID]        int IDENTITY(1, 1) NOT NULL,
    [CharField] varchar(50)        NULL
);

-- 25 rows in total: 4 x A, 3 x B, 2 x C, 2 x D, one each of F..N,
-- three single-space strings, one NULL and one empty string.
INSERT INTO #CharacterTest (CharField)
VALUES ('A'), ('A'), ('A'), ('A'),
       ('B'), ('B'), ('B'),
       ('C'), ('C'),
       ('D'), ('D'),
       ('F'), ('G'), ('H'), ('I'), ('J'), ('K'), ('L'), ('M'), ('N'),
       (' '), (' '), (' '),
       (NULL),
       ('');
I would like a query which gives me a character string like this:
A (16%), B (12%), C (8%)
Please notice the following:
I don't want to have empty strings, strings with all blanks, or nulls listed in the top 3, but I do want the percentage of values calculated using the entire record count for the table.
Ties can be ignored, so if there were 22 values in the list with 8% frequency, it's alright to simply return whichever one is first.
Percentages can be rounded to whole numbers.
I'd like to find the easiest way to write this query while still retaining T-SQL compatibility back to SQL Server 2005. What is the best way to do this? Window Functions?

I'd go for.
-- Builds one string listing the three most frequent non-blank values with
-- their share of ALL rows, e.g. "A (16%), B (12%), C (8%)".
WITH T1
     AS (SELECT [CharField],
                -- COUNT(*) OVER () counts every row in the table, so blank and
                -- NULL rows still contribute to the denominator.
                100.0 * COUNT(*) OVER (PARTITION BY [CharField]) /
                COUNT(*) OVER () AS Pct
         FROM   #CharacterTest),
     T2
     AS (SELECT DISTINCT TOP 3 [CharField], Pct
         FROM   T1
         WHERE  [CharField] <> '' --Excludes all blank or NULL as well
         -- CharField is only a tie-breaker so the pick is repeatable.
         ORDER  BY Pct DESC, [CharField])
-- ROUND(Pct, 0) rounds to a whole number before the INT cast (the cast alone
-- would truncate). TYPE + .value() stops FOR XML from entity-escaping
-- characters such as & or < in CharField. STUFF removes the leading ', '.
SELECT STUFF((SELECT ', ' + [CharField] + ' (' + CAST(CAST(ROUND(Pct, 0) AS INT) AS VARCHAR(3)) + '%)'
              FROM   T2
              ORDER  BY Pct DESC, [CharField]
              FOR XML PATH(''), TYPE).value('.', 'VARCHAR(MAX)'), 1, 2, '') AS Result;

My first attempt would probably be this. Not saying that it's the best way to handle it, but that it would work.
-- Denominator: every row in the table, including blank and NULL rows.
DECLARE @TotalCount INT;
SELECT @TotalCount = COUNT(*) FROM #CharacterTest AS ct;

-- Three most frequent non-blank values with their share of all rows.
-- Multiplying by 100.0 returns a percentage (0-100) and forces decimal division.
SELECT TOP(3) CharField, COUNT(*) * 100.0 / @TotalCount AS OverallPercentage
FROM #CharacterTest AS ct
WHERE CharField IS NOT NULL AND REPLACE(CharField, ' ', '') <> ''
GROUP BY CharField
-- CharField is only a tie-breaker so repeated runs return the same rows.
ORDER BY COUNT(*) DESC, CharField;

DROP TABLE #CharacterTest;

This should get the character string you need:
-- Accumulates "value (pct%)" for the three most frequent non-blank values.
declare @output varchar(200);
with cte as (
    select CharField
         -- integer division against the full table row count (blank/NULL rows included)
         , (count(*) * 100) / (select count(*) from #CharacterTest) as CharPct
         -- CharField breaks ties so the ranking is repeatable
         , row_number() over (order by count(*) desc, CharField) as RowNum
    from #CharacterTest
    where replace(CharField, ' ', '') not like ''
    group by CharField
)
-- NOTE(review): concatenating into a variable across rows together with ORDER BY
-- is not guaranteed by SQL Server to visit rows in order; verify on the target
-- version, or switch to FOR XML PATH if the order ever comes back wrong.
select @output = coalesce(@output + ', ', '') + CharField + ' (' + cast(CharPct as varchar(11)) + '%)'
from cte
where RowNum <= 3
order by RowNum;
select @output;
-- Returns:
-- A (16%), B (12%), C (8%)
I would draw attention to storing a single character in a varchar(50) column, however.

Related

How to move the data to the next line based on spaces in sqlserver 2008 R2

Input: Move the rest of the column value onto the next line wherever two words are separated by 3 spaces, or wherever a word is longer than 9 characters.
-- Sample data for the line-splitting question, held in a table variable.
declare @Table table(CL1 varchar(50))
-- NOTE(review): the gap after 'Ohh' is restored to three spaces; the stated rule
-- and the expected output ("Ohh" / "my GOD") need it - confirm against the
-- original post.
INSERT INTO @Table (CL1)
SELECT 'Ohh   my GOD'
UNION ALL
SELECT 'hindunewspaer is no1 paper'
select * from @Table
o/p :
CL1
ohh
my god
hindunewspaer
is no1 paper
Used a Split/Parse function. Can be inline if needed.
EDIT - Switch to a Parser which is not limited to 8K because the final
string could easily be larger than 8K
Example
;with cte0 as (
    -- Mark every 3-space gap with '~~~', split each row on single spaces, and
    -- wrap any word longer than 9 characters in '~~~' on both sides.
    -- NOTE(review): the REPLACE search string is restored to three spaces; with
    -- a single space every word would land on its own line, which contradicts
    -- the documented output - confirm against the original post.
    Select Seq=Row_Number() over (Order by (Select null)),RetSeq,RetVal
    From @Table A
    Cross Apply (
        Select RetSeq
              ,RetVal=case when len(RetVal)>9 then '~~~' else '' end+RetVal+case when len(RetVal)>9 then '~~~' else '' end
        From [dbo].[udf-Str-Parse](Replace(CL1,'   ','~~~ '),' ')
    ) B ),
-- Glue all words back into one space-separated string (STUFF drops the leading space).
cte1 as ( Select S=Stuff((Select ' '+RetVal From cte0 Order by Seq For XML Path ('')),1,1,'') )
-- Split that string on the '~~~' markers: one output row per line.
Select CL1 = RetVal
From cte1 A
Cross Apply [dbo].[udf-Str-Parse](A.S,'~~~') B
Order By RetSeq
Returns
CL1
Ohh
my GOD
hindunewspaer
is no1 paper
The Split/Parse Function if Needed
-- Inline table-valued splitter: returns one row per piece of @String separated
-- by @Delimiter.
--   @String    : text to split
--   @Delimiter : separator (up to 10 characters)
-- Columns returned:
--   RetSeq : Row_Number() over the pieces (ordered by (Select null), so the
--            numbering is not explicitly tied to position in the string)
--   RetVal : the piece, with leading and trailing spaces trimmed
CREATE FUNCTION [dbo].[udf-Str-Parse] (@String varchar(max),@Delimiter varchar(10))
Returns Table
As
Return (
    Select RetSeq = Row_Number() over (Order By (Select null))
          ,RetVal = LTrim(RTrim(B.i.value('(./text())[1]', 'varchar(max)')))
    -- The inner FOR XML PATH('') entity-escapes the input first, so characters such
    -- as < and & survive the cast to xml; the placeholder is then turned into
    -- element boundaries.
    From (Select x = Cast('<x>' + replace((Select replace(@String,@Delimiter,'§§Split§§') as [*] For XML Path('')),'§§Split§§','</x><x>')+'</x>' as xml).query('.')) as A
    Cross Apply x.nodes('x') AS B(i)
);
--Thanks Shnugo for making this XML safe
--Select * from [dbo].[udf-Str-Parse]('Dog,Cat,House,Car',',')
--Select * from [dbo].[udf-Str-Parse]('John Cappelletti was here',' ')
--Select * from [dbo].[udf-Str-Parse]('this,is,<test>,for,< & >',',')

Query to find duplicate rows in a table

I am running the following query which is terribly inefficient and can take hours. I am having SQL brain farts today and I do not know how to improve this query. There are several nullable varchar fields, and I need to identify the duplicate rows (all columns containing identical values as another row)
-- Returns every row of transactions whose (ColA, ColB, ColC) combination occurs
-- more than once; NULLs are treated as equal to '' for the comparison.
select * from transactions x where exists (
    select Coalesce(y.ColA, ''),
           Coalesce(y.ColB, ''),
           Coalesce(y.ColC, '')
    from transactions y
    -- Correlate the inner rows (y) with the outer row (x). Comparing x with
    -- itself is always true and makes the subquery group the whole table for
    -- every outer row.
    where Coalesce(y.ColA, '') = Coalesce(x.ColA, '') and
          Coalesce(y.ColB, '') = Coalesce(x.ColB, '') and
          Coalesce(y.ColC, '') = Coalesce(x.ColC, '')
    group by Coalesce(y.ColA, ''),
             Coalesce(y.ColB, ''),
             Coalesce(y.ColC, '')
    having count(*) > 1
)
Why does this take so long to run? There has to be a better way.
You could improve it by
removing unnecessary checks
putting a composite index on ColA, ColB and ColC
What is unnecessary? It seems to be unnecessary to join the table with itself. Why don't you use a simple GROUP BY? You also don't need the WHERE:
-- One row per (ColA, ColB, ColC) combination that occurs more than once, with
-- its occurrence count; NULLs are folded into '' before grouping.
SELECT
    COALESCE(src.ColA, '') AS ColA,
    COALESCE(src.ColB, '') AS ColB,
    COALESCE(src.ColC, '') AS ColC,
    COUNT(*)               AS Cnt
FROM transactions AS src
GROUP BY
    COALESCE(src.ColA, ''),
    COALESCE(src.ColB, ''),
    COALESCE(src.ColC, '')
HAVING COUNT(*) > 1
Does this work?
-- Demo: fill a table variable with 10,000 random rows, then list duplicates.
DECLARE @transactions TABLE (
ColA INT
, ColB INT
, ColC INT
, ColD INT
, ColE INT
, ColF INT
)
DECLARE @Counter1 INT = 0
WHILE @Counter1 < 10000
BEGIN
SET @Counter1 += 1
-- Each RAND() call yields a new value; ROUND(...*10, 0) maps it to 0..10.
INSERT INTO @transactions (ColA, ColB, ColC, ColD, ColE, ColF)
SELECT ROUND(RAND()*10,0)
, ROUND(RAND()*10,0)
, ROUND(RAND()*10,0)
, ROUND(RAND()*10,0)
, ROUND(RAND()*10,0)
, ROUND(RAND()*10,0)
END
-- Number the rows inside each group of identical values; rn > 1 marks the
-- second and later copies (the first copy of each group is not returned).
;WITH Dupe
AS (
SELECT *, ROW_NUMBER() OVER
(PARTITION BY ColA, ColB, ColC, ColD, ColE, ColF
ORDER BY ColA, ColB, ColC, ColD, ColE, ColF) AS rn
FROM @transactions
)
SELECT * FROM Dupe WHERE rn > 1
You can use an ISNULL on anything where you need to compare a value that might be null. Note that most of this I've written is just to generate a useful data set. With 6 columns and 10,000 rows I got 42 identical rows in less than a second. No triples. Bumped it up to 100,000 rows and I got 3,489 duplicate rows, including some triples. Took 3 seconds.
Here's an example using text. This whole thing took 25 seconds on 100,000 records, although my timer shows that less than 4 of that was finding the duplicates, with the remainder being the table population.
-- Demo with text columns: fill a table variable with 100,000 rows of randomly
-- chosen names (including NULL), then list the duplicate rows.
DECLARE @transactions2 TABLE (
ColA NVARCHAR(30)
, ColB NVARCHAR(30)
, ColC NVARCHAR(30)
, ColD NVARCHAR(30)
, ColE NVARCHAR(30)
, ColF NVARCHAR(30)
)
-- Lookup list of 27 values (26 names plus one NULL), keyed by identity 1..27.
DECLARE @names TABLE (
ID INT IDENTITY
, Name NVARCHAR(30)
)
DECLARE @Counter2 INT = 0
, @ColA NVARCHAR(30)
, @ColB NVARCHAR(30)
, @ColC NVARCHAR(30)
, @ColD NVARCHAR(30)
, @ColE NVARCHAR(30)
, @ColF NVARCHAR(30)
INSERT INTO @names (Name) VALUES
('Anderson, Arthur')
, ('Broberg, Bruce')
, ('Chan, Charles')
, ('Davidson, Darwin')
, ('Eggert, Emily')
, ('Fox, Francesca')
, ('Garbo, Greta')
, ('Hollande, Hortense')
, ('Iguadolla, Ignacio')
, ('Jackson, Jurimbo')
, ('Katana, Ken')
, ('Lawrence, Larry')
, ('McDonald, Michael')
, ('Nyugen, Nathan')
, ('O''Dell, Oliver')
, ('Peterson, Phillip')
, ('Quigley, Quentin')
, ('Ramallah, Rodolfo')
, ('Smith, Samuel')
, ('Turner, Theodore')
, ('Uno, Umberto')
, ('Victor, Victoria')
, ('Wallace, William')
, ('Xing, Xiopan')
, ('Young, Yvette')
, ('Zapata, Zorro')
, (NULL)
WHILE @Counter2 < 100000
BEGIN
SET @Counter2 += 1
-- ROUND(RAND()*27 + .5, 0) picks a random ID in 1..27.
SET @ColA = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0))
SET @ColB = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0))
SET @ColC = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0))
SET @ColD = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0))
SET @ColE = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0))
SET @ColF = (SELECT Name FROM @names WHERE ID = ROUND(RAND()*27 +.5,0))
-- The last value is @ColF: inserting @ColD twice left ColF as a copy of ColD.
INSERT INTO @transactions2 (ColA, ColB, ColC, ColD, ColE, ColF)
SELECT @ColA, @ColB, @ColC, @ColD, @ColE, @ColF
END
-- Timestamps printed before and after to time the duplicate search only.
PRINT CAST(GETDATE() AS DateTime2 (3))
-- ISNULL(..., '') puts NULLs in the same partition as empty strings.
-- rn > 1 marks the second and later copies in each group.
;WITH Dupe
AS (
SELECT *, ROW_NUMBER() OVER
(PARTITION BY ISNULL(ColA,''), ISNULL(ColB,''), ISNULL(ColC,''), ISNULL(ColD,''), ISNULL(ColE,''), ISNULL(ColF,'')
ORDER BY ISNULL(ColA,''), ISNULL(ColB,''), ISNULL(ColC,''), ISNULL(ColD,''), ISNULL(ColE,''), ISNULL(ColF,'')) AS rn
FROM @transactions2
)
SELECT * FROM Dupe WHERE rn > 1 ORDER BY rn
PRINT CAST(GETDATE() AS DateTime2 (3))
Here is a much faster way using a subquery join. It ran in under 10 seconds
-- Returns every transactions row whose (ColA, ColB, ColC) combination occurs
-- more than once. The derived table finds the duplicated combinations once;
-- the join then pulls all rows that carry them.
select x.*
from transactions x
join (
    -- The columns need aliases, otherwise dups.ColA etc. cannot be referenced.
    select Coalesce(ColA, '') as ColA,
           Coalesce(ColB, '') as ColB,
           Coalesce(ColC, '') as ColC
    from transactions
    group by Coalesce(ColA, ''),
             Coalesce(ColB, ''),
             Coalesce(ColC, '')
    having count(*) > 1
) dups on
    -- Apply the same NULL -> '' folding to x, so duplicate rows that contain
    -- NULLs still match their group.
    dups.ColA = Coalesce(x.ColA, '') and
    dups.ColB = Coalesce(x.ColB, '') and
    dups.ColC = Coalesce(x.ColC, '')
The important thing about this query is that it returns both/all rows, not just the duplicate(s)
If this is a one time job, and involves a huge number of rows, and not to be made as a View, then perhaps you'd opt to INSERT SELECT it into a table with UNIQUE index with IGNORE_DUP_KEY option.

TSQL Pivoting Issue - looking for better approach

This is a T-SQL related question. I am using SQL Server 2012.
I have a table like this:
I would like to have output like this:
Explanation:
For each employee, there will be a row. An employee has one or more assignments. Batch Id specifies this. Based on the batch Id, the column names will change (e.g. Country 1, Country 2 etc.).
Approach so far:
Un-pivot the source table like the following:
-- Unpivot SourceTable into (EmpId, ColumnName, ColumnValue) rows: one branch
-- for the Country column and one for the Pass column. ColumnName carries the
-- batch number, e.g. 'Country 1' / 'Pass 1'.
SELECT
    src.EmpId,
    'Country ' + CAST(src.BatchId AS varchar(30)) AS [ColumnName],
    src.Country                                   AS [ColumnValue]
FROM SourceTable AS src
UNION
SELECT
    src.EmpId,
    'Pass ' + CAST(src.BatchId AS varchar(30)) AS [ColumnName],
    src.Pass                                   AS [ColumnValue]
FROM SourceTable AS src
which gives each column's values as rows. Then, this result can be pivoted to get the desired output.
Questions:
Is there a better way of doing this?
At the moment, I know there will be fixed amount of batches, but, for future, if I like to make the pivoting part dynamic, what is the best approach?
Using tools like SSIS or SSRS, is it easier to handle the pivot dynamically?
Screw doing it in SQL.
Let SSRS do the work for you with a MATRIX. It will PIVOT for you without having to create dynamic SQL to handle the terrible limitation of needing to know all the columns.
For your data, you would have EMP ID as the ROW Group and PASS as your column grouping.
https://msdn.microsoft.com/en-us/library/dd207149.aspx
There are many possible solutions to achieve what you want (search for Dynamic Pivot on multiple columns)
SqlFiddleDemo
Warning: I assume that columns Country and Pass are NOT NULL
-- Sample schema and data for the pivot question.
CREATE TABLE SourceTable(EmpId INT, BatchId INT,
Country NVARCHAR(100) NOT NULL, Pass NVARCHAR(5) NOT NULL);
INSERT INTO SourceTable(EmpId, BatchId, Country, Pass)
VALUES
(100, 1, 'UK', 'M'), (200, 2, 'USA', 'U'),
(100, 2, 'Romania', 'M'), (100, 3, 'India', 'MA'),
(100, 4, 'Hongkong', 'MA'), (300, 1, 'Belgium', 'U'),
(300, 2, 'Poland', 'U'), (200, 1, 'Australia', 'M');
/* Get Number of Columns Groups Country1..Country<MaxCount> */
DECLARE @max_count INT
,@sql NVARCHAR(MAX) = ''
,@columns NVARCHAR(MAX) = ''
,@i INT = 0
,@i_s NVARCHAR(10);
-- @max_count = the largest number of rows any single employee has.
WITH cte AS
(
SELECT EmpId
,[cnt] = COUNT(*)
FROM SourceTable
GROUP BY EmpId
)
SELECT @max_count = MAX(cnt)
FROM cte;
-- Build one Country<n> / Pass<n> pair of conditional aggregates per position.
WHILE @i < @max_count
BEGIN
SET @i += 1;
SET @i_s = CAST(@i AS NVARCHAR(10));
SET @columns += N',MAX(CASE WHEN [row_no] = ' + @i_s + ' THEN Country END) AS Country' + @i_s +
',MAX(CASE WHEN [row_no] = ' + @i_s + ' THEN Pass END) AS Pass' + @i_s;
END
-- row_no numbers each employee's rows by BatchId; the generated columns pick
-- the row at each position. Only values computed above are concatenated in.
SELECT @sql =
N';WITH cte AS (
SELECT EmpId, Country, Pass, [row_no] = ROW_NUMBER() OVER (PARTITION BY EmpId ORDER BY BatchId)
FROM SourceTable)
SELECT EmpId ' + @columns + N'
FROM cte
GROUP BY EmpId';
/* Debug */
/* SELECT @sql */
EXEC(@sql);
Or:
SQLFiddleDemo2
-- Dynamic PIVOT: builds the column list Country<BatchId>/Pass<BatchId> from the
-- data, then pivots the unpivoted (EmpId, col_name, val) rows.
DECLARE @cols NVARCHAR(MAX),
@sql NVARCHAR(MAX) = '';
-- One row per distinct output column. rn is the BatchId itself, so each column
-- name maps to exactly one rn. A ROW_NUMBER per employee could give the same
-- name different rn values, which would repeat the name in the IN list.
;WITH cte(col_name, rn) AS(
SELECT DISTINCT col_name = c.col_name + CAST(BatchId AS VARCHAR(10)),
rn = BatchId
FROM SourceTable
CROSS APPLY (VALUES ('Country', Country), ('Pass', Pass)) AS c(col_name, val)
)
-- QUOTENAME brackets each name; TYPE/.value avoids XML entity escaping;
-- STUFF removes the leading comma.
SELECT @cols = STUFF((SELECT ',' + QUOTENAME(col_name)
FROM cte
ORDER BY rn, col_name /* If column order is important for you */
FOR XML PATH(''), TYPE
).value('.', 'NVARCHAR(MAX)')
, 1, 1, '');
SET @sql =
N';WITH cte AS
(
SELECT EmpId, col_name = col_name + CAST(BatchId AS VARCHAR(10)), val
FROM SourceTable
CROSS APPLY (VALUES (''Country'', Country), (''Pass'', Pass)) AS c(col_name, val)
)
SELECT *
FROM cte
PIVOT
(
MAX(val)
FOR col_name IN (' + @cols + ')
) piv';
EXEC(@sql);

Sort data before concatenating using STUFF FOR XML

I have the following query that I am using for an SSRS Report:
-- One row per judge of event 1278, with a comma-separated list of the fight
-- numbers in which that person was judge 1, 2 or 3.
SELECT
    ROW_NUMBER() OVER (ORDER BY Judge.EventJudgeID) AS JudgeRow,
    Judge.EventID,
    Judge.Judge_PersonID,
    STUFF(
        (
            -- DISTINCT is applied to the already-concatenated text values.
            SELECT DISTINCT
                ',' + CAST(Fights.FightNumber AS VARCHAR(MAX)) AS [text()]
            FROM dbo.tblFights AS Fights
            INNER JOIN dbo.tblFightJudge AS FRJudge
                ON Fights.FightID = FRJudge.fightid
            WHERE Judge.Judge_PersonID IN (FRJudge.judge1id, FRJudge.judge2id, FRJudge.judge3id)
            FOR XML PATH('')
        ),
        1, 1, ''   -- drop the leading comma
    ) AS BoutsJudged,
    Persons.LastName + ' ' + Persons.FirstName AS JudgeName,
    Events.EventName,
    Events.EventDate
FROM dbo.tblEventJudge AS Judge
INNER JOIN dbo.tblPersons AS Persons
    ON PersonID = Judge_PersonID
INNER JOIN dbo.tblEvents AS Events
    ON Events.EventID = Judge.EventID
WHERE Judge.EventID = 1278;
The problem is that the STUFF command returns the following string:
1,10,11,12,13,14,15,16,17,18,19,2,3,4,5,6,7,8,9
How can I make it sort the numbers before concatenating it into a string?
Try this
-- Same report as the question, but the fight numbers are de-duplicated as
-- numbers in a derived table first and sorted numerically before being
-- concatenated, so the list reads 1,2,3,...,10,11 instead of 1,10,11,...
SELECT ROW_NUMBER() OVER ( ORDER BY Judge.EventJudgeID ) AS JudgeRow ,
Judge.EventID ,
Judge.Judge_PersonID ,
-- The subquery must be wrapped in its own parentheses inside STUFF(...).
STUFF(( SELECT ',' + CAST(X.FightNumber AS VARCHAR(MAX)) AS [text()]
FROM ( SELECT DISTINCT Fights.FightNumber
FROM dbo.tblFights Fights ,
dbo.tblFightJudge FRJudge
WHERE Fights.FightID = FRJudge.fightid
AND ( Judge.Judge_PersonID = FRJudge.judge1id
OR Judge.Judge_PersonID = FRJudge.judge2id
OR Judge.Judge_PersonID = FRJudge.judge3id
)
) X
-- Outside the derived table only the alias X is in scope, not Fights.
ORDER BY X.FightNumber
FOR
XML PATH('')
), 1, 1, '') AS BoutsJudged ,
Persons.LastName + ' ' + Persons.FirstName AS JudgeName ,
Events.EventName ,
Events.EventDate
FROM dbo.tblEventJudge Judge
INNER JOIN dbo.tblPersons Persons ON PersonID = Judge_PersonID
INNER JOIN dbo.tblEvents Events ON Events.EventID = Judge.EventID
WHERE Judge.EventID = 1278;
You can check below sqls,
Before :
-- "Before": DISTINCT is applied to the concatenated text pieces.
SELECT
    *,
    STUFF(
        (
            SELECT DISTINCT ',' + CAST(high AS varchar(MAX))
            FROM master..spt_values
            WHERE type = 'p'
              AND number < 20
            FOR XML PATH('')
        ),
        1, 1, ''
    )
FROM master..spt_values
WHERE type = 'p'
  AND number < 20
After :
-- "After": de-duplicate the numeric values in a derived table, then order by
-- the number before concatenating.
SELECT
    *,
    STUFF(
        (
            SELECT ',' + CAST(high AS varchar(MAX))
            FROM (
                SELECT DISTINCT high
                FROM master..spt_values
                WHERE type = 'p'
                  AND number < 20
            ) x
            ORDER BY high
            FOR XML PATH('')
        ),
        1, 1, ''
    )
FROM master..spt_values
WHERE type = 'p'
  AND number < 20
I apologize for this solution being pedantic, but I have a hard time parsing code and need to see things in steps. Also, Microsoft adds a feature to do this in the 2012 release, but this code should work in most releases. First, use a database open to users in most SQLServers...
-- Sample of the catalog view used in the steps below.
USE MASTER;
SELECT TOP 3
    TABLE_NAME,
    COLUMN_NAME,
    ORDINAL_POSITION
FROM INFORMATION_SCHEMA.COLUMNS;
/*TABLE_NAME COLUMN_NAME ORDINAL_POSITION
spt_fallback_db xserver_name 1
spt_fallback_db xdttm_ins 2
spt_fallback_db xdttm_last_ins_upd 3
*/
Now, breaking down this approach (to sorting a list within a column)...
(1) Adding FOR XML PATH('') to a 1 column query pivots it to one row, but adds XML tags for the column header...
-- Step 1: a named column produces one XML element per row.
SELECT TOP 3
    COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
FOR XML PATH('');
/*<COLUMN_NAME>xserver_name</COLUMN_NAME><COLUMN_NAME>xdttm_ins</COLUMN_NAME><COLUMN_NAME>xdttm_last_ins_upd</COLUMN_NAME>*/
(2) Concatenation nullifies the column header, eliminating the tags. Any string will work; I want a comma and a space...
-- Step 2: the expression has no column name, so no element tags are emitted.
SELECT TOP 3
    ', ' + COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
FOR XML PATH('');
/*, xserver_name, xdttm_ins, xdttm_last_ins_upd*/
(3) Other columns will need their own SELECT, so FOR XML must be a subquery, and ORDER BY is a legal prefix in a FOR XML subquery ;)...
-- Step 3: correlated FOR XML subquery - one ordered column list per table.
SELECT TOP 2
    t.TABLE_NAME,
    (
        SELECT ', ' + c.COLUMN_NAME
        FROM INFORMATION_SCHEMA.COLUMNS AS c
        WHERE c.TABLE_NAME = t.TABLE_NAME
        ORDER BY c.ORDINAL_POSITION
        FOR XML PATH('')
    ) AS LIST_OF_COLUMNS
FROM INFORMATION_SCHEMA.TABLES AS t
ORDER BY t.TABLE_NAME;
/*TABLE_NAME LIST_OF_COLUMNS
spt_fallback_db , xserver_name, xdttm_ins, xdttm_last_ins_upd, xfallback_dbid, name, dbid, status, version
spt_fallback_dev , xserver_name, xdttm_ins, xdttm_last_ins_upd, xfallback_low, xfallback_drive, low, high, status, name, phyname
*/
(4) Finally, SUBSTRING is more familiar to me than STUFF for removing a known prefix...
-- Step 4: same list, with the leading ', ' removed by SUBSTRING.
SELECT TOP 2
    t.TABLE_NAME,
    SUBSTRING(
        (
            SELECT ', ' + c.COLUMN_NAME
            FROM INFORMATION_SCHEMA.COLUMNS AS c
            WHERE c.TABLE_NAME = t.TABLE_NAME
            ORDER BY c.ORDINAL_POSITION
            FOR XML PATH('')
        ),
        2+1,   --Add 1 to start substring after the first 2 characters
        99999
    ) AS LIST_OF_COLUMNS
FROM INFORMATION_SCHEMA.TABLES AS t
ORDER BY t.TABLE_NAME;
/*TABLE_NAME LIST_OF_COLUMNS
spt_fallback_db xserver_name, xdttm_ins, xdttm_last_ins_upd, xfallback_dbid, name, dbid, status, version
spt_fallback_dev xserver_name, xdttm_ins, xdttm_last_ins_upd, xfallback_low, xfallback_drive, low, high, status, name, phyname
*/
Pedantically yours - Jim Gettman

Set operation on TSQL (SQL 2005/2008)

When a set is given say {1,2,3,4,5,6}
The task is to list every pair (two-element subset) of the set:
{1,2},
{1,3},
{1,4},
{1,5},
{1,6},
{2,3},
{2,4},
{2,5},
{2,6},
{3,4},
{3,5},
{3,6},
{4,5},
{4,6},
{5,6}
So when i have a table
Table Element
1
2
3
4
5
6
What is the way to list out all possible pair of comma separated subset ?
(Duplicates can be ignored (i.e) {1,2} is identical to {2,1})
-- Every unordered pair once: the join keeps only rows where the second
-- element is greater than the first.
SELECT
    lo.elem,
    hi.elem
FROM MyTable AS lo
INNER JOIN MyTable AS hi
    ON hi.elem > lo.elem
...gets you most of the way there - if you want these shown as sets then...
-- Same pairs, rendered as text in the form {a, b}.
SELECT
    '{' + CAST(lo.elem AS VARCHAR(12)) + ', ' + CAST(hi.elem AS VARCHAR(12)) + '}'
FROM MyTable AS lo
INNER JOIN MyTable AS hi
    ON hi.elem > lo.elem
...is what you're after.
Here is a solution to the problem using a CTE. It isn’t particularly elegant, but it gets the job done.
-- Lists every unordered pair of elements once, using a CTE over a table variable.
DECLARE @set TABLE (Element INT);
INSERT INTO @set(Element) VALUES (1);
INSERT INTO @set(Element) VALUES (2);
INSERT INTO @set(Element) VALUES (3);
INSERT INTO @set(Element) VALUES (4);
INSERT INTO @set(Element) VALUES (5);
INSERT INTO @set(Element) VALUES (6);
-- All ordered pairs of different elements, numbered. Ordering by BOTH elements
-- makes Row deterministic, so the two references to the CTE below see the same
-- numbering and (a,b) with a < b always precedes (b,a).
;WITH array (Element1, Element2, Row)
AS
(
SELECT t.Element
, t2.Element
, ROW_NUMBER() OVER(ORDER BY t.Element, t2.Element)
FROM @set AS t
CROSS JOIN @set AS t2
WHERE t.Element <> t2.Element
)
-- Keep a pair only if its mirror image has not appeared earlier.
SELECT a.Element1
, a.Element2
, '{' + CONVERT(VARCHAR(5),a.Element1) + ',' + CONVERT(VARCHAR(5),a.Element2) + '}' AS 'Subset'
FROM array AS a
WHERE NOT EXISTS (SELECT *
FROM array AS sa
WHERE sa.Element1 = a.Element2
AND sa.Element2 = a.Element1
AND sa.Row < a.Row
);