SQL Pivot to PGSQL crosstab-->Data not matching - postgresql

I have the following code in SQL. Please help; I am stuck on this pivot.
-- Builds and runs a dynamic PIVOT of EstValue over the distinct
-- PeriodEndDatDisp values found in #Esti.
-- NOTE(review): QStr and Query carry a '#' prefix here; T-SQL local variables
-- normally start with '@' -- presumably mangled in transcription, confirm.
-- Step 1: append ',[<period>]' for every distinct period, in sorted order.
SELECT #QStr = COALESCE(#QStr,'')+',['+ColtoRow+']' FROM(
SELECT DISTINCT PeriodEndDatDisp ColtoRow
from #Esti
)A GROUP BY ColtoRow order by ColtoRow
-- Step 2: remove the leading comma produced by step 1.
SELECT #QStr=STUFF(#QStr,1,1,'')
-- Step 3: assemble the statement; the period list is spliced in twice
-- (once in the select list, once in the PIVOT ... IN clause).
SELECT #Query = '
SELECT GroupID,GroupName,Mes_Type,ParentProductId,ParentProductName,Meas_Nam,fn_ProperCase(SegmentType) SegmentType,SegMeasType,
CurrencyId,ValType,IsPerSha,Mes_Order,ShowSegData,' +#QStr+ '
FROM (
SELECT GroupID,GroupName,Mes_Type,ParentProductId,ParentProductName,Meas_Nam,SegmentType,SegMeasType,EstValue,CurrencyId,ValType,Mes_Order,
ShowSegData,IsPerSha,
PeriodEndDatDisp ColtoRow
FROM #Esti
) AS Src_Table PIVOT
(
MAX(EstValue) FOR ColtoRow IN (' +#QStr+' )
) AS PivotTable ORDER BY GroupID,Mes_Order,Meas_Nam,SegmentType,SegMeasType,ValType;'
-- Step 4: run the assembled statement.
EXECUTE #Query;
the Equivalent code in PGSQL is
-- Lists the distinct period labels as one comma-separated string.
SELECT string_agg(DISTINCT PeriodEndDateDisplay,',') ColtoRow
from t$Estimate
--o/p FY-2015,FY-2016,Q1-2015,Q1-2016,Q2-2015,Q2-2016,Q3-2015,Q3-2016,Q4-2015,Q4-2016
-- Attempted crosstab(source_sql, category_sql) translation of the PIVOT.
-- Review notes on the statement as written:
--  * crosstab(text, text) reads the source columns as: row_name first, then
--    any extra columns, then category and value as the LAST two columns.
--    Here the last two are IsPerShare and ShowSegmentData; EstValue sits
--    mid-list and PeriodEndDateDisplay is not selected at all.
--  * The first source column (GroupID) is the row_name, so every source row
--    sharing a GroupID is folded into a single output row.
--  * The source query has no ORDER BY; crosstab expects rows of the same
--    row_name to arrive together.
--  * The output column list names FY2016 and has no FY-2015 entry, while the
--    category query returns both FY-2016 and FY-2015.
--  * Q1-2015 ... Q4-2016 are unquoted; an identifier containing '-' has to be
--    double-quoted (e.g. "Q1-2015").
SELECT *
FROM crosstab(
'SELECT
GroupID,GroupName,MeasureType,ParentProductId,ParentProductName,
MeasureName,est.fn_ProperCase(SegmentType) SegmentType,SegmentMeasureType, EstValue
,CurrencyId,ValType,MeasureOrder,IsPerShare,ShowSegmentData
FROM est.t$estimate'
, $$ SELECT unnest('{Q2-2016,Q4-2015,Q1-2015,Q3-2016,Q4-2016,FY-2016,Q1-2016,Q3-2015,Q2-2015,FY-2015}'::text[])$$
) AS ct ( groupid integer, groupname character varying(500) , measuretype character varying(100) , parentproductid character varying(100) ,
parentproductname character varying(200), measurename character varying(200) , segmenttype character varying ,
segmentmeasuretype character varying(20), EstValue text, currencyid character varying(3), valtype integer,
measureorder integer, ispershare boolean,FY2016 text,
Q1-2015 text,Q1-2016 text,Q2-2015 text,Q2-2016 text,Q3-2015 text,Q3-2016 text,Q4-2015 text,Q4-2016 text);
Problem encountered:
Only 1 row is populated and the quarter values are all NULL. In MSSQL it returns 25 rows.
The hyphen (-) is creating an issue in the quarter time-period values, e.g. Q4-2015.

Related

pivot or reshapre sql [duplicate]

I've been tasked with coming up with a means of translating the following data:
date category amount
1/1/2012 ABC 1000.00
2/1/2012 DEF 500.00
2/1/2012 GHI 800.00
2/10/2012 DEF 700.00
3/1/2012 ABC 1100.00
into the following:
date ABC DEF GHI
1/1/2012 1000.00
2/1/2012 500.00
2/1/2012 800.00
2/10/2012 700.00
3/1/2012 1100.00
The blank spots can be NULLs or blanks, either is fine, and the categories would need to be dynamic. Another possible caveat to this is that we'll be running the query in a limited capacity, which means temp tables are out. I've tried to research and have landed on PIVOT but as I've never used that before I really don't understand it, despite my best efforts to figure it out. Can anyone point me in the right direction?
Dynamic SQL PIVOT:
-- Sample data: one amount per (date, category) row.
CREATE TABLE temp
(
    [date]   DATETIME,
    category VARCHAR(3),
    amount   MONEY
);

-- Dates are written as yyyymmdd so they parse identically under every
-- language / DATEFORMAT setting (e.g. '2/10/2012' is 2 Oct under dmy).
INSERT INTO temp ([date], category, amount) VALUES ('20120101', 'ABC', 1000.00);
INSERT INTO temp ([date], category, amount) VALUES ('20120201', 'DEF', 500.00);
INSERT INTO temp ([date], category, amount) VALUES ('20120201', 'GHI', 800.00);
INSERT INTO temp ([date], category, amount) VALUES ('20120210', 'DEF', 700.00);
INSERT INTO temp ([date], category, amount) VALUES ('20120301', 'ABC', 1100.00);

-- T-SQL local variables take the '@' sigil ('#' would denote a temp table).
DECLARE @cols  AS NVARCHAR(MAX),
        @query AS NVARCHAR(MAX);

-- Build "[ABC],[DEF],[GHI]": one QUOTENAME'd entry per distinct category.
-- FOR XML PATH('') concatenates the rows; STUFF strips the leading comma.
SET @cols = STUFF((SELECT DISTINCT ',' + QUOTENAME(c.category)
                   FROM temp AS c
                   FOR XML PATH(''), TYPE
                  ).value('.', 'NVARCHAR(MAX)')
                  , 1, 1, '');

-- The column list is spliced in twice: as the output columns and as the
-- PIVOT ... IN list.
SET @query = 'SELECT date, ' + @cols + ' from
(
select date
, amount
, category
from temp
) x
pivot
(
max(amount)
for category in (' + @cols + ')
) p ';

EXECUTE(@query);

DROP TABLE temp;
Results:
Date ABC DEF GHI
2012-01-01 00:00:00.000 1000.00 NULL NULL
2012-02-01 00:00:00.000 NULL 500.00 800.00
2012-02-10 00:00:00.000 NULL 700.00 NULL
2012-03-01 00:00:00.000 1100.00 NULL NULL
Dynamic SQL PIVOT
Different approach for creating columns string
-- Sample data in a session-scoped temp table (visible to the dynamic SQL below).
CREATE TABLE #temp
(
    [date]   DATETIME,
    category VARCHAR(3),
    amount   MONEY
);

-- yyyymmdd literals parse the same under every language / DATEFORMAT setting.
INSERT INTO #temp ([date], category, amount) VALUES ('20120101', 'ABC', 1000.00);
INSERT INTO #temp ([date], category, amount) VALUES ('20120201', 'DEF', 500.00);
INSERT INTO #temp ([date], category, amount) VALUES ('20120201', 'GHI', 800.00);
INSERT INTO #temp ([date], category, amount) VALUES ('20120210', 'DEF', 700.00);
INSERT INTO #temp ([date], category, amount) VALUES ('20120301', 'ABC', 1100.00);

-- Local variables take the '@' sigil; only the temp table keeps '#'.
DECLARE @cols  AS NVARCHAR(MAX) = '';
DECLARE @query AS NVARCHAR(MAX) = '';

-- Append "[category]," once per distinct category.
SELECT @cols = @cols + QUOTENAME(category) + ','
FROM (SELECT DISTINCT category FROM #temp) AS tmp;

-- Trim the trailing ",": SUBSTRING from position 0 for LEN characters yields
-- the first LEN - 1 characters (and '' when the list is empty).
SELECT @cols = SUBSTRING(@cols, 0, LEN(@cols));

SET @query =
'SELECT * from
(
select date, amount, category from #temp
) src
pivot
(
max(amount) for category in (' + @cols + ')
) piv';

EXECUTE(@query);

DROP TABLE #temp;
Result
date ABC DEF GHI
2012-01-01 00:00:00.000 1000.00 NULL NULL
2012-02-01 00:00:00.000 NULL 500.00 800.00
2012-02-10 00:00:00.000 NULL 700.00 NULL
2012-03-01 00:00:00.000 1100.00 NULL NULL
I know this question is older but I was looking thru the answers and thought that I might be able to expand on the "dynamic" portion of the problem and possibly help someone out.
First and foremost, I built this solution to solve a problem a couple of coworkers were having with inconsistent and large data sets that needed to be pivoted quickly.
This solution requires the creation of a stored procedure so if that is out of the question for your needs please stop reading now.
This procedure is going to take in the key variables of a pivot statement to dynamically create pivot statements for varying tables, column names and aggregates. The Static column is used as the group by / identity column for the pivot(this can be stripped out of the code if not necessary but is pretty common in pivot statements and was necessary to solve the original issue), the pivot column is where the end resultant column names will be generated from, and the value column is what the aggregate will be applied to. The Table parameter is the name of the table including the schema (schema.tablename) this portion of the code could use some love because it is not as clean as I would like it to be. It worked for me because my usage was not publicly facing and sql injection was not a concern. The Aggregate parameter will accept any standard sql aggregate 'AVG', 'SUM', 'MAX' etc. The code also defaults to MAX as an aggregate this is not necessary but the audience this was originally built for did not understand pivots and were typically using max as an aggregate.
Lets start with the code to create the stored procedure. This code should work in all versions of SSMS 2005 and above but I have not tested it in 2005 or 2016 but I can not see why it would not work.
-- Builds and runs a PIVOT for an arbitrary table.
--   @STATIC_COLUMN  column kept as the row identifier of the result
--   @PIVOT_COLUMN   column whose distinct values become the output columns
--   @VALUE_COLUMN   column the aggregate is applied to
--   @TABLE          source table, schema-qualified (schema.tablename)
--   @AGGREGATE      aggregate function name; NULL or '' falls back to MAX
-- Returns one result set: the static column followed by one NVARCHAR(255)
-- column per pivot value.
-- SECURITY: every parameter is concatenated into dynamic SQL as an identifier
-- or keyword and cannot be bound as a value. Call this only with trusted
-- input, or validate the names against the catalog first.
CREATE PROCEDURE [dbo].[USP_DYNAMIC_PIVOT]
(
    @STATIC_COLUMN VARCHAR(255),
    @PIVOT_COLUMN  VARCHAR(255),
    @VALUE_COLUMN  VARCHAR(255),
    @TABLE         VARCHAR(255),
    @AGGREGATE     VARCHAR(20) = NULL
)
AS
BEGIN
    SET NOCOUNT ON;

    DECLARE @AVAIABLE_TO_PIVOT NVARCHAR(MAX),
            @SQLSTRING         NVARCHAR(MAX),
            @PIVOT_SQL_STRING  NVARCHAR(MAX),
            @TEMPVARCOLUMNS    NVARCHAR(MAX);

    IF ISNULL(@AGGREGATE, '') = ''
    BEGIN
        SET @AGGREGATE = 'MAX';
    END;

    -- Collect the distinct, non-empty pivot values as " [a], [b], [c]".
    -- QUOTENAME over NVARCHAR(128) replaces the earlier CONVERT(VARCHAR, ...)
    -- cast to VARCHAR(50), which cut long values short so that they no longer
    -- matched the data in the PIVOT ... IN list.
    SET @PIVOT_SQL_STRING = N'SELECT @PIVOT_VALUES = STUFF((SELECT DISTINCT '', '' + QUOTENAME(CONVERT(NVARCHAR(128), ' + @PIVOT_COLUMN + N')) [text()]
        FROM ' + @TABLE + N'
        WHERE ISNULL(' + @PIVOT_COLUMN + N','''') <> ''''
        FOR XML PATH(''''), TYPE)
        .value(''.'',''NVARCHAR(MAX)''),1,2,'' '')';

    -- The list comes back through a real OUTPUT parameter.
    EXEC sp_executesql
        @PIVOT_SQL_STRING,
        N'@PIVOT_VALUES NVARCHAR(MAX) OUTPUT',
        @PIVOT_VALUES = @AVAIABLE_TO_PIVOT OUTPUT;

    -- Turn "[a], [b]" into "[a] nvarchar(255) null, [b] nvarchar(255) null".
    -- Limitation: a pivot value that itself contains a comma breaks this step.
    SET @TEMPVARCOLUMNS = REPLACE(@AVAIABLE_TO_PIVOT, ',', ' nvarchar(255) null,') + ' nvarchar(255) null';

    -- Pivot into a table variable so every output column is NVARCHAR(255),
    -- then return its contents.
    SET @SQLSTRING = N'DECLARE @RETURN_TABLE TABLE (' + @STATIC_COLUMN + N' NVARCHAR(255) NULL,' + @TEMPVARCOLUMNS + N')
        INSERT INTO @RETURN_TABLE(' + @STATIC_COLUMN + N',' + @AVAIABLE_TO_PIVOT + N')
        select * from (
        SELECT ' + @STATIC_COLUMN + N' , ' + @PIVOT_COLUMN + N', ' + @VALUE_COLUMN + N' FROM ' + @TABLE + N' ) a
        PIVOT
        (
        ' + @AGGREGATE + N'(' + @VALUE_COLUMN + N')
        FOR ' + @PIVOT_COLUMN + N' IN (' + @AVAIABLE_TO_PIVOT + N')
        ) piv
        SELECT * FROM @RETURN_TABLE';

    EXEC sp_executesql @SQLSTRING;
END
Next we will get our data ready for the example. I have taken the data example from the accepted answer with the addition of a couple of data elements to use in this proof of concept to show the varied outputs of the aggregate change.
-- Demo table: the accepted answer's rows plus three extra rows ("added")
-- that give some (date, category) pairs more than one amount.
CREATE TABLE temp
(
    [date]   DATETIME,
    category VARCHAR(3),
    amount   MONEY
)
INSERT INTO temp ([date], category, amount) VALUES ('1/1/2012', 'ABC', 1000.00)
INSERT INTO temp ([date], category, amount) VALUES ('1/1/2012', 'ABC', 2000.00) -- added
INSERT INTO temp ([date], category, amount) VALUES ('2/1/2012', 'DEF', 500.00)
INSERT INTO temp ([date], category, amount) VALUES ('2/1/2012', 'DEF', 1500.00) -- added
INSERT INTO temp ([date], category, amount) VALUES ('2/1/2012', 'GHI', 800.00)
INSERT INTO temp ([date], category, amount) VALUES ('2/10/2012', 'DEF', 700.00)
INSERT INTO temp ([date], category, amount) VALUES ('2/10/2012', 'DEF', 800.00) -- added
INSERT INTO temp ([date], category, amount) VALUES ('3/1/2012', 'ABC', 1100.00)
The following examples show the varied execution statements showing the varied aggregates as a simple example. I did not opt to change the static, pivot, and value columns to keep the example simple. You should be able to just copy and paste the code to start messing with it yourself
-- Same static/pivot/value columns and table each time; only the aggregate
-- (fifth argument) changes between the four calls.
exec [dbo].[USP_DYNAMIC_PIVOT] 'date','category','amount','dbo.temp','sum'
exec [dbo].[USP_DYNAMIC_PIVOT] 'date','category','amount','dbo.temp','max'
exec [dbo].[USP_DYNAMIC_PIVOT] 'date','category','amount','dbo.temp','avg'
exec [dbo].[USP_DYNAMIC_PIVOT] 'date','category','amount','dbo.temp','min'
This execution returns the following data sets respectively.
Updated version for SQL Server 2017 using STRING_AGG function to construct the pivot column list:
-- Sample data: one amount per (date, category) row.
create table temp
(
    date datetime,
    category varchar(3),
    amount money
);
insert into temp values ('20120101', 'ABC', 1000.00);
insert into temp values ('20120201', 'DEF', 500.00);
insert into temp values ('20120201', 'GHI', 800.00);
insert into temp values ('20120210', 'DEF', 700.00);
insert into temp values ('20120301', 'ABC', 1100.00);

-- Local variables take the '@' sigil.
DECLARE @cols  AS NVARCHAR(MAX),
        @query AS NVARCHAR(MAX);

-- One bracket-quoted entry per distinct category. QUOTENAME keeps names with
-- spaces, brackets or reserved words valid as column identifiers; converting
-- the input to nvarchar(max) makes STRING_AGG return nvarchar(max) instead of
-- failing once the list outgrows the non-max result type.
SET @cols = (SELECT STRING_AGG(CONVERT(NVARCHAR(MAX), QUOTENAME(category)), ',')
             FROM (SELECT DISTINCT category FROM temp WHERE category IS NOT NULL) AS t);

-- The list is used both as the output columns and as the PIVOT ... IN list.
set @query = 'SELECT date, ' + @cols + ' from
(
select date
, amount
, category
from temp
) x
pivot
(
max(amount)
for category in (' + @cols + ')
) p ';
execute(@query);
drop table temp;
Here's my solution, which cleans up the unnecessary NULL values:
-- Pivots CantidadCuotas by CodigoFormaPago per product, then collapses the
-- pivoted rows with MAX(...) GROUP BY so each product appears once.
-- Local variables take the '@' sigil.
DECLARE @cols    AS NVARCHAR(MAX),
        @maxcols AS NVARCHAR(MAX),
        @query   AS NVARCHAR(MAX)

-- "[code1],[code2],...": the pivot column list, ordered by code.
select @cols = STUFF((SELECT ',' + QUOTENAME(CodigoFormaPago)
from PO_FormasPago
order by CodigoFormaPago
FOR XML PATH(''), TYPE
).value('.', 'NVARCHAR(MAX)')
,1,1,'')

-- "MAX([code1]) as [code1],...": the same codes wrapped for the outer GROUP BY.
select @maxcols = STUFF((SELECT ',MAX(' + QUOTENAME(CodigoFormaPago) + ') as ' + QUOTENAME(CodigoFormaPago)
from PO_FormasPago
order by CodigoFormaPago
FOR XML PATH(''), TYPE
).value('.', 'NVARCHAR(MAX)')
,1,1,'')

set @query = 'SELECT CodigoProducto, DenominacionProducto, ' + @maxcols + '
FROM
(
SELECT
CodigoProducto, DenominacionProducto,
' + @cols + ' from
(
SELECT
p.CodigoProducto as CodigoProducto,
p.DenominacionProducto as DenominacionProducto,
fpp.CantidadCuotas as CantidadCuotas,
fpp.IdFormaPago as IdFormaPago,
fp.CodigoFormaPago as CodigoFormaPago
FROM
PR_Producto p
LEFT JOIN PR_FormasPagoProducto fpp
ON fpp.IdProducto = p.IdProducto
LEFT JOIN PO_FormasPago fp
ON fpp.IdFormaPago = fp.IdFormaPago
) xp
pivot
(
MAX(CantidadCuotas)
for CodigoFormaPago in (' + @cols + ')
) p
) xx
GROUP BY CodigoProducto, DenominacionProducto'

-- NOTE(review): this line read "t #query;" in the source, presumably a
-- truncated PRINT of the generated statement -- confirm.
PRINT @query;
execute(@query);
The code below produces the results with NULL replaced by zero in the output.
Table creation and data insertion:
-- Demo table; note that [date] is stored as text here, not as a date type.
CREATE TABLE test_table
(
    [date]   NVARCHAR(10),
    category CHAR(3),
    amount   MONEY
)
INSERT INTO test_table ([date], category, amount) VALUES ('1/1/2012','ABC',1000.00)
INSERT INTO test_table ([date], category, amount) VALUES ('2/1/2012','DEF',500.00)
INSERT INTO test_table ([date], category, amount) VALUES ('2/1/2012','GHI',800.00)
INSERT INTO test_table ([date], category, amount) VALUES ('2/10/2012','DEF',700.00)
INSERT INTO test_table ([date], category, amount) VALUES ('3/1/2012','ABC',1100.00)
Query to generate the exact results which also replaces NULL with zeros:
-- Dynamic pivot of test_table that shows 0 instead of NULL for missing cells.
-- Local variables take the '@' sigil.
DECLARE @DynamicPivotQuery AS NVARCHAR(MAX),
        @PivotColumnNames AS NVARCHAR(MAX),
        @PivotSelectColumnNames AS NVARCHAR(MAX)

--Get distinct values of the PIVOT Column
-- (result: "[ABC],[DEF],[GHI]"; ISNULL(... + ',', '') skips the comma on the first row)
SELECT @PivotColumnNames = ISNULL(@PivotColumnNames + ',','')
    + QUOTENAME(category)
FROM (SELECT DISTINCT category FROM test_table) AS cat

--Get distinct values of the PIVOT Column with isnull
-- (result: "ISNULL([ABC], 0) AS [ABC],...")
SELECT @PivotSelectColumnNames
    = ISNULL(@PivotSelectColumnNames + ',','')
    + 'ISNULL(' + QUOTENAME(category) + ', 0) AS '
    + QUOTENAME(category)
FROM (SELECT DISTINCT category FROM test_table) AS cat

--Prepare the PIVOT query using the dynamic
SET @DynamicPivotQuery =
N'SELECT date, ' + @PivotSelectColumnNames + '
FROM test_table
pivot(sum(amount) for category in (' + @PivotColumnNames + ')) as pvt';

--Execute the Dynamic Pivot Query
EXEC sp_executesql @DynamicPivotQuery
OUTPUT :
A version of Taryn's answer with performance improvements:
Data
-- Demo table for the dynamic pivot: one amount per ([date], category) row.
CREATE TABLE dbo.Temp
(
    [date]   datetime NOT NULL,
    category nchar(3) NOT NULL,
    amount   money    NOT NULL,

    -- Both indexes are declared inline with the table definition.
    INDEX [CX dbo.Temp date] CLUSTERED ([date]),
    INDEX [IX dbo.Temp category] NONCLUSTERED (category)
);

-- Five sample rows, written with {D '...'} date escapes and $ money literals.
INSERT INTO dbo.Temp ([date], category, amount)
VALUES
    ({D '2012-01-01'}, N'ABC', $1000.00),
    ({D '2012-01-02'}, N'DEF', $500.00),
    ({D '2012-01-02'}, N'GHI', $800.00),
    ({D '2012-02-10'}, N'DEF', $700.00),
    ({D '2012-03-01'}, N'ABC', $1100.00);
Dynamic pivot
-- Dynamic pivot of dbo.Temp: builds the category column list, then runs the
-- PIVOT through sp_executesql. Local variables take the '@' sigil.
DECLARE
    @Delimiter nvarchar(4000) = N',',
    @DelimiterLength bigint,
    @Columns nvarchar(max),
    @Query nvarchar(max);

-- LEN ignores trailing spaces, so spaces are swapped for a visible character
-- before measuring the delimiter.
SET @DelimiterLength = LEN(REPLACE(@Delimiter, SPACE(1), N'#'));

-- Before SQL Server 2017
-- Each row emits the delimiter followed by the quoted category; STUFF then
-- removes the first delimiter.
SET @Columns =
    STUFF
    (
        (
            SELECT
                [text()] = @Delimiter,
                [text()] = QUOTENAME(T.category)
            FROM dbo.Temp AS T
            WHERE T.category IS NOT NULL
            GROUP BY T.category
            ORDER BY T.category
            FOR XML PATH (''), TYPE
        )
        .value(N'text()[1]', N'nvarchar(max)'),
        1, @DelimiterLength, SPACE(0)
    );

-- Alternative for SQL Server 2017+ and database compatibility level 110+
-- (when both statements run, this assignment overwrites the one above)
SELECT @Columns =
    STRING_AGG(CONVERT(nvarchar(max), QUOTENAME(T.category)), N',')
        WITHIN GROUP (ORDER BY T.category)
FROM
(
    SELECT T2.category
    FROM dbo.Temp AS T2
    WHERE T2.category IS NOT NULL
    GROUP BY T2.category
) AS T;

-- Skip the pivot entirely when there are no categories to pivot on.
IF @Columns IS NOT NULL
BEGIN
    SET @Query =
        N'SELECT [date], ' +
        @Columns +
        N'
FROM
(
SELECT [date], amount, category
FROM dbo.Temp
) AS S
PIVOT
(
MAX(amount)
FOR category IN (' +
        @Columns +
        N')
) AS P;';
    EXECUTE sys.sp_executesql @Query;
END;
Execution plans
Results
date
ABC
DEF
GHI
2012-01-01 00:00:00.000
1000.00
NULL
NULL
2012-01-02 00:00:00.000
NULL
500.00
800.00
2012-02-10 00:00:00.000
NULL
700.00
NULL
2012-03-01 00:00:00.000
1100.00
NULL
NULL
-- Demo: pivot the ClientId values of each ID across its Description values.
CREATE TABLE #PivotExample(
    [ID] [nvarchar](50) NULL,
    [Description] [nvarchar](50) NULL,
    [ClientId] [smallint] NOT NULL
)
GO
INSERT #PivotExample ([ID],[Description], [ClientId]) VALUES ('ACI1','ACI1Desc1',1008)
INSERT #PivotExample ([ID],[Description], [ClientId]) VALUES ('ACI1','ACI1Desc2',2000)
INSERT #PivotExample ([ID],[Description], [ClientId]) VALUES ('ACI1','ACI1Desc3',3000)
INSERT #PivotExample ([ID],[Description], [ClientId]) VALUES ('ACI1','ACI1Desc4',4000)
INSERT #PivotExample ([ID],[Description], [ClientId]) VALUES ('ACI2','ACI2Desc1',5000)
INSERT #PivotExample ([ID],[Description], [ClientId]) VALUES ('ACI2','ACI2Desc2',6000)
INSERT #PivotExample ([ID],[Description], [ClientId]) VALUES ('ACI2','ACI2Desc3', 7000)
SELECT * FROM #PivotExample

--Declare necessary variables ('@' sigil; only the temp table keeps '#')
DECLARE @SQLQuery AS NVARCHAR(MAX)
DECLARE @PivotColumns AS NVARCHAR(MAX)

--Get unique values of pivot column
-- (a temp table is referenced by its bare #name, without a schema prefix)
SELECT @PivotColumns = COALESCE(@PivotColumns + ',','') + QUOTENAME([Description])
FROM (SELECT DISTINCT [Description] FROM #PivotExample) AS PivotExample
--SELECT @PivotColumns

--Create the dynamic query with all the values for
--pivot column at runtime
SET @SQLQuery =
N' -- Your pivoted result comes here
SELECT ID, ' + @PivotColumns + '
FROM
(
-- Source table should in a inner query
SELECT ID,[Description],[ClientId]
FROM #PivotExample
)AS P
PIVOT
(
-- Select the values from derived table P
SUM(ClientId)
FOR [Description] IN (' + @PivotColumns + ')
)AS PVTTable'
--SELECT @SQLQuery

--Execute dynamic query
EXEC sp_executesql @SQLQuery
Drop table #PivotExample
Fully generic way that will work in non-traditional MS SQL environments (e.g. Azure Synapse Analytics Serverless SQL Pools) - it's in a SPROC but no need to use as such...
-- DROP PROCEDURE IF EXISTS
IF OBJECT_ID('dbo.usp_generic_pivot') IS NOT NULL
    DROP PROCEDURE dbo.usp_generic_pivot
-- GO is a client-side batch separator, not a T-SQL statement, so it must
-- stand alone on its line without a trailing semicolon.
GO
-- Generic dynamic pivot.
-- SECURITY: all five parameters are spliced into dynamic SQL as identifiers /
-- keywords; pass trusted values only.
CREATE PROCEDURE dbo.usp_generic_pivot (
    @source NVARCHAR (100), -- table or view object name
    @pivotCol NVARCHAR (100), -- the column to pivot
    @pivotAggCol NVARCHAR (100), -- the column with the values for the pivot
    @pivotAggFunc NVARCHAR (20), -- the aggregate function to apply to those values
    @leadCols NVARCHAR (100) -- comma separated list of other columns to keep and order by
)
AS
BEGIN
    DECLARE @pivotedColumns NVARCHAR(MAX)
    DECLARE @tsql NVARCHAR(MAX)

    -- Step 1: collect the distinct pivot values as "[v1],[v2],...".
    -- The nvarchar(max) conversion lets STRING_AGG return more than the
    -- non-max result size when there are many values.
    SET @tsql = CONCAT('SELECT @pivotedColumns = STRING_AGG(CONVERT(NVARCHAR(MAX), qname), '','') FROM (SELECT DISTINCT QUOTENAME(', @pivotCol, ') AS qname FROM ', @source, ') AS qnames')
    EXEC sp_executesql @tsql, N'@pivotedColumns nvarchar(max) out', @pivotedColumns out

    -- Step 2: assemble and run the PIVOT, keeping and ordering by the lead columns.
    SET @tsql = CONCAT ( 'SELECT ', @leadCols, ',', @pivotedColumns, ' FROM ', ' ( SELECT ', @leadCols, ',',
        @pivotAggCol, ',', @pivotCol, ' FROM ', @source, ') as t ',
        ' PIVOT (', @pivotAggFunc, '(', @pivotAggCol, ')', ' FOR ', @pivotCol,
        ' IN (', @pivotedColumns, ')) as pvt ', ' ORDER BY ', @leadCols)
    EXEC (@tsql)
END
GO
-- TEST EXAMPLE
EXEC dbo.usp_generic_pivot
    @source = '[your_db].[dbo].[form_answers]',
    @pivotCol = 'question',
    @pivotAggCol = 'answer',
    @pivotAggFunc = 'MAX',
    @leadCols = 'candidate_id, candidate_name'
GO

Update column using dblink

I am using the reference below to update 2 columns (customer name, service) in table cust_eq_memory_dy.
The loopback in table msrouterlistfinal2 will match the address in cust_eq_memory_dy.
Can someone help me with this, as I get a syntax error at or near "FROM"?
Update between 2 databases using dblink not working
-- Attempt to copy customername from the remote msrouterlistfinal2 (read via
-- dblink) into cust_eq_memory_dy, matching ipaddress to loopbackip.
-- NOTE(review): WHERE is written before FROM here; PostgreSQL's UPDATE takes
-- the clauses in the order SET ... FROM ... WHERE.
UPDATE cust_eq_memory_dy B
SET customername = A.customername
WHERE B.ipaddress = A.loopbackip
FROM (
SELECT *
FROM DBLINK ( 'host= 10.X.80.160 user=123 password=123 dbname=postgres',
'select customername, serviceid, loopbackip FROM msrouterlistfinal2 ')
as temp (
customername character varying (100),
serviceid character varying (50),
loopbackip character varying (30) )
)A
If you are using Postgres, I highly recommend using the WITH clause.
-- Pull the remote rows once through dblink, then update only the local rows
-- that have a matching loopback address. A correlated scalar subquery in SET
-- would touch every row of cust_eq_memory_dy and overwrite customername with
-- NULL wherever no remote row matches; UPDATE ... FROM leaves unmatched rows
-- alone.
-- If several remote rows share a loopbackip, one of them is used (which one
-- is not predictable).
WITH a AS (
    SELECT customername, serviceid, loopbackip
    FROM dblink(
        'host= 10.X.80.160 user=123 password=123 dbname=postgres',
        'select customername, serviceid, loopbackip FROM msrouterlistfinal2 '
    ) AS temp (
        customername character varying (100),
        serviceid    character varying (50),
        loopbackip   character varying (30)
    )
)
UPDATE cust_eq_memory_dy AS b
SET customername = a.customername
FROM a
WHERE b.ipaddress = a.loopbackip;
Check this link for more information.
https://www.postgresql.org/docs/8.4/static/queries-with.html

Percentage of Values for Top 3 from a Character Field

I have an unusual situation. Please consider the following code:
-- Rebuild the scratch table from a clean slate.
IF OBJECT_ID('tempdb..#CharacterTest') IS NOT NULL
    DROP TABLE #CharacterTest

CREATE TABLE #CharacterTest
(
    [ID]        int IDENTITY(1, 1) NOT NULL,
    [CharField] varchar(50) NULL
)

-- 25 rows in total: 20 letter values, three blank strings, one NULL and one
-- empty string.
INSERT INTO #CharacterTest (CharField)
VALUES ('A'), ('A'), ('A'), ('A'),
       ('B'), ('B'), ('B'),
       ('C'), ('C'),
       ('D'), ('D'),
       ('F'), ('G'), ('H'), ('I'), ('J'), ('K'), ('L'), ('M'), ('N'),
       (' '), (' '), (' '),
       (NULL),
       ('');
I would like a query which gives me a character string like this:
A (16%), B (12%), C(8%)
Please notice the following:
I don't want to have empty strings, strings with all blanks, or nulls listed in the top 3, but I do want the percentage of values calculated using the entire record count for the table.
Ties can be ignored, so if there were 22 values in the list with 8% frequency, it's alright to simply return whichever one is first.
Percentages can be rounded to whole numbers.
I'd like to find the easiest way to write this query while still retaining T-SQL compatibility back to SQL Server 2005. What is the best way to do this? Window Functions?
I'd go for.
-- Returns one string such as "A (16%), B (12%), C (8%)": the three most
-- frequent non-blank values, each with its share of ALL rows in the table.
WITH T1
     AS (SELECT [CharField],
                -- rows holding this value / total rows (blanks and NULLs included)
                100.0 * COUNT(*) OVER (PARTITION BY [CharField]) /
                COUNT(*) OVER () AS Pct
         FROM   #CharacterTest),
     T2
     AS (SELECT DISTINCT TOP 3 *
         FROM   T1
         WHERE  [CharField] <> '' --Excludes all blank or NULL as well
         ORDER  BY Pct DESC)
-- ROUND(Pct, 0) rounds to the nearest whole percent before the INT cast
-- (a bare cast would truncate). TYPE/.value() returns the text unescaped, so
-- values containing &, < or > are not XML-entity encoded. STUFF removes the
-- leading ", ".
SELECT STUFF((SELECT ', ' + [CharField] + ' (' + CAST(CAST(ROUND(Pct, 0) AS INT) AS VARCHAR(3)) + '%)'
              FROM   T2
              ORDER  BY Pct DESC
              FOR XML PATH(''), TYPE).value('.', 'VARCHAR(MAX)'), 1, 2, '') AS Result
My first attempt would probably be this. Not saying that it's the best way to handle it, but that it would work.
-- Top three non-blank values with their share of all rows, as a result set.
-- Local variables take the '@' sigil.
DECLARE @TotalCount INT

-- Denominator: every row, including blanks and NULLs.
SELECT @TotalCount = COUNT(*) FROM #CharacterTest AS ct

-- 100.0 makes OverallPercentage an actual percentage (16.0) rather than a
-- fraction (0.16) and forces decimal division.
SELECT TOP(3) CharField, COUNT(*) * 100.0 / @TotalCount AS OverallPercentage
FROM #CharacterTest AS ct
WHERE CharField IS NOT NULL AND REPLACE(CharField, ' ', '') <> ''
GROUP BY CharField
ORDER BY COUNT(*) desc

DROP TABLE #CharacterTest
This should get the character string you need:
-- Builds the "A (16%), B (12%), C (8%)" string in a variable.
-- Local variables take the '@' sigil.
declare @output varchar(200);

with cte as (
    select CharField
        -- integer arithmetic: the percentage is truncated to a whole number
        , (count(*) * 100) / (select count(*) from #CharacterTest) as CharPct
        -- ties on count are broken alphabetically by CharField
        , row_number() over (order by count(*) desc, CharField) as RowNum
    from #CharacterTest
    -- drops empty and all-space strings; NULLs fail the predicate as well
    where replace(CharField, ' ', '') not like ''
    group by CharField
)
select @output = coalesce(@output + ', ', '') + CharField + ' (' + cast(CharPct as varchar(11)) + '%)'
from cte
where RowNum <= 3
order by RowNum;

select @output;
-- Returns:
-- A (16%), B (12%), C (8%)
I would draw attention to storing a single character in a varchar(50) column, however.

How to approach data warehouse (PostgreSQL) documentation?

We have a small data warehouse in a PostgreSQL database, and I have to document all the tables.
I thought I could add a comment to every column and table and use the pipe "|" separator to add more attributes. Then I can use the information schema and array functions to get the documentation, and use any reporting software to create the desired output.
-- Column report for schema data_warehouse; each column comment is split on
-- '|' into five documentation fields.
select
    col.ordinal_position,
    col.column_name,
    col.data_type,
    col.character_maximum_length,
    col.numeric_precision,
    col.numeric_scale,
    col.is_nullable,
    col.column_default,
    (string_to_array(descr.description,'|'))[1] as cs_name,
    (string_to_array(descr.description,'|'))[2] as cs_description,
    (string_to_array(descr.description,'|'))[3] as en_name,
    (string_to_array(descr.description,'|'))[4] as en_description,
    (string_to_array(descr.description,'|'))[5] as other
from
    information_schema.columns as col
    -- Resolve the table through its schema as well as its name, so a
    -- same-named table in another schema cannot contribute rows or comments.
    join pg_catalog.pg_namespace as nsp
        on nsp.nspname = col.table_schema
    join pg_catalog.pg_class as klass
        on klass.relnamespace = nsp.oid
       and klass.relname = col.table_name
       and klass.relkind = 'r'
    -- pg_description is keyed by (objoid, classoid, objsubid); classoid pins
    -- the comment to a pg_class object.
    left join pg_catalog.pg_description as descr
        on descr.classoid = 'pg_catalog.pg_class'::regclass
       and descr.objoid = klass.oid
       and descr.objsubid = col.ordinal_position
where
    col.table_schema = 'data_warehouse'
order by
    -- keep each table's columns together instead of interleaving tables
    col.table_name,
    col.ordinal_position;
Is it a good idea, or is there a better approach?
Unless you must include descriptions of the system tables, I wouldn't try to shoehorn your descriptions into pg_catalog.pg_description. Make your own table. That way you get to keep the columns as columns, and not have to use clunky string functions.
Alternatively, consider adding specially formatted comments to your master schema file, along the lines of javadoc. Then write a tool to extract those comments and create a document. That way the comments stay close to the thing they're commenting, and you don't have to mess with the database at all to produce the report. For example:
--* Used for authentication.
create table users
(
--* standard Rails-friendly primary key. Also an example of
--* a long comment placed before the item, rather than on the
--* the same line.
id serial primary key,
name text not null, --* Real name (hopefully)
login text not null, --* Name used for authentication
...
);
Your documentation tool reads the file, looks for the --* comments, figures out what comments go with what things, and produces some kind of report, e.g.:
table users: Used for authentication
id: standard Rails-friendly primary key. Also an example of a
long comment placed before the item, rather than on the same
line.
name: Real name
login: Name used for authentication
You might note that with appropriate comments, the master schema file itself is a pretty good report in its own right, and that perhaps nothing else is needed.
If anyone interested, here is what I've used for initial load for my small documentation project. Documentation is in two tables, one for describing tables and one for describing columns and constraints. I appreciate any feedback.
/* -- Initial Load - Tables */
-- IF EXISTS lets the script run on a database where the table has not been
-- created yet; CASCADE also drops dependent objects.
drop table if exists dw_description_table cascade;

-- One row of documentation per physical table.
create table dw_description_table (
    table_description_key serial primary key,
    physical_full_name character varying,
    physical_schema_name character varying,
    physical_table_name character varying,
    Table_Type character varying, -- Fact Dimension ETL Transformation
    Logical_Name_CS character varying,
    Description_CS character varying,
    Logical_Name_EN character varying,
    Description_EN character varying,
    ToDo character varying,
    Table_Load_Type character varying, --Manually TruncateLoad AddNewRows
    Known_Exclusions character varying,
    Table_Clover_Script character varying
);

-- Seed one row per table whose name starts with dw or etl; the descriptive
-- columns stay NULL until they are filled in.
insert into dw_description_table (physical_full_name, physical_schema_name, physical_table_name)
select
    table_schema || '.' || table_name as physical_full_name,
    table_schema,
    table_name
from
    information_schema.tables
where
    table_name like 'dw%' or table_name like 'etl%';
/* -- Initial Load - Columns */
-- One row of documentation per physical column, linked to its table row.
CREATE TABLE dw_description_column (
    column_description_key serial,
    table_description_key  bigint,
    -- physical identity of the column
    physical_full_name     text,
    physical_schema_name   character varying,
    physical_table_name    character varying,
    physical_column_name   character varying,
    ordinal_position       character varying,
    column_default         character varying,
    is_nullable            character varying,
    data_type              character varying,
    -- hand-maintained documentation fields
    logical_name_cs        character varying,
    description_cs         character varying,
    logical_name_en        character varying,
    description_en         character varying,
    derived_rule           character varying,
    todo                   character varying,
    -- key / constraint information
    pk_name                character varying,
    fk_name                character varying,
    foreign_table_name     character varying,
    foreign_column_name    character varying,
    is_primary_key         boolean,
    is_foreign_key         boolean,
    CONSTRAINT dw_description_column_pkey
        PRIMARY KEY (column_description_key ),
    CONSTRAINT fk_dw_description_table_key
        FOREIGN KEY (table_description_key)
        REFERENCES dw_description_table (table_description_key) MATCH SIMPLE
        ON UPDATE NO ACTION ON DELETE NO ACTION
);
-- Initial load of dw_description_column: one row per column of the tables
-- already registered in dw_description_table, with primary-key and
-- foreign-key details attached where a matching constraint is found.
insert into dw_description_column (
table_description_key ,
physical_full_name ,
physical_schema_name ,
physical_table_name ,
physical_column_name ,
ordinal_position ,
column_default ,
is_nullable ,
data_type ,
logical_name_cs ,
description_cs ,
logical_name_en ,
description_en ,
derived_rule ,
todo ,
pk_name ,
fk_name ,
foreign_table_name ,
foreign_column_name ,
is_primary_key ,
is_foreign_key )
(
with
-- PRIMARY KEY / FOREIGN KEY constraints per constrained column, keyed by
-- schema.table.column (physical_full_name).
-- NOTE(review): kcu and ccu are joined on constraint_name (plus table_name
-- for kcu) without constraint_schema, and a multi-column constraint pairs
-- every key column with every referenced column -- verify that the resulting
-- duplicates are acceptable.
dw_constraints as (
SELECT
tc.constraint_name,
tc.constraint_schema || '.' || tc.table_name || '.' || kcu.column_name as physical_full_name,
tc.constraint_schema,
tc.table_name,
kcu.column_name,
ccu.table_name AS foreign_table_name,
ccu.column_name AS foreign_column_name,
TC.constraint_type
FROM
information_schema.table_constraints AS tc
JOIN information_schema.key_column_usage AS kcu ON (tc.constraint_name = kcu.constraint_name and tc.table_name = kcu.table_name)
JOIN information_schema.constraint_column_usage AS ccu ON ccu.constraint_name = tc.constraint_name
WHERE
constraint_type in ('PRIMARY KEY','FOREIGN KEY')
-- NOTE(review): only this CTE is restricted to schema 'bizdata'; the outer
-- query below has no schema filter.
AND tc.constraint_schema = 'bizdata'
and (tc.table_name like 'dw%' or tc.table_name like 'etl%')
-- GROUP BY over every selected column acts as a DISTINCT.
group by
tc.constraint_name,
tc.constraint_schema,
tc.table_name,
kcu.column_name,
ccu.table_name ,
ccu.column_name,
TC.constraint_type
)
select
dwdt.table_description_key,
col.table_schema || '.' || col.table_name || '.' || col.column_name as physical_full_name,
col.table_schema as physical_schema_name,
col.table_name as physical_table_name,
col.column_name as physical_column_name,
col.ordinal_position,
col.column_default,
col.is_nullable,
col.data_type,
-- descriptive fields are loaded as NULL
null as Logical_Name_CS ,
null as Description_CS ,
null as Logical_Name_EN,
null as Description_EN ,
null as Derived_Rule ,
null as ToDo,
dwc1.constraint_name pk_name,
dwc2.constraint_name as fk_name,
dwc2.foreign_table_name,
dwc2.foreign_column_name,
case when dwc1.constraint_name is not null then true else false end as is_primary_key,
-- values map to the INSERT column list by position, so this alias
-- (foreign_key rather than is_foreign_key) does not affect the load
case when dwc2.constraint_name is not null then true else false end as foreign_key
from
information_schema.columns col
join dw_description_table dwdt on (col.table_schema || '.' || col.table_name = dwdt.physical_full_name )
-- dwc1 contributes primary-key info, dwc2 foreign-key info, for the same column
left join dw_constraints dwc1 on ((col.table_schema || '.' || col.table_name || '.' || col.column_name) = dwc1.physical_full_name and dwc1.constraint_type = 'PRIMARY KEY')
left join dw_constraints dwc2 on ((col.table_schema || '.' || col.table_name || '.' || col.column_name) = dwc2.physical_full_name and dwc2.constraint_type = 'FOREIGN KEY')
where
col.table_name like 'dw%' or col.table_name like 'etl%'
)

in T-SQL, is it possible to find names of columns containing NULL in a given row (without knowing all column names)?

Is it possible in T-SQL to write a proper query reflecting this pseudo-code:
SELECT {primary_key}, {column_name}
FROM {table}
WHERE {any column_name value} is NULL
i.e. without referencing each column-name explicitly.
Sounds simple enough but I've searched pretty extensively and found nothing.
You have to use dynamic sql to solve that problem. I have demonstrated how it could be done.
With this SQL you can pick a table and check the row with id = 1 for columns that are NULL and for primary keys. I included a test table at the bottom of the script. The code will not display anything if there are no primary keys and no NULL columns.
-- For one chosen row of one table, reports which key columns it has and which
-- non-key columns are NULL, as a single row with one output column per source
-- column.
-- Local variables take the '@' sigil. Buffers are sized so that long table or
-- column names and wide tables are not cut off (column names can be up to 128
-- characters).
DECLARE @table_name VARCHAR(128)
DECLARE @chosencolumn VARCHAR(1000)
DECLARE @sqlstring VARCHAR(MAX)
DECLARE @sqlstring2 VARCHAR(MAX)
DECLARE @text VARCHAR(MAX)
DECLARE @t TABLE (col1 VARCHAR(300), dummy INT)

SET @table_name = 'test_table' -- replace with your tablename if you want
SET @chosencolumn = 'ID=1' -- replace with criteria for selected row

-- One "UNION ALL SELECT ..." branch per column that is not listed in
-- KEY_COLUMN_USAGE; each branch returns a row only if that column is NULL in
-- the chosen row.
SELECT @sqlstring = COALESCE(@sqlstring, '') + 'UNION ALL SELECT '',''''NULL '''' '' + '''+t1.column_name+''', 1000 ordinal_position FROM ['+@table_name+'] WHERE [' +t1.column_name+ '] is null and ' +@chosencolumn+ ' '
FROM INFORMATION_SCHEMA.COLUMNS t1
LEFT JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE t2
ON t1.column_name = t2.column_name
AND t1.table_name = t2.table_name
AND t1.table_schema = t2.table_schema
WHERE t1.table_name = @table_name
AND t2.column_name is null

-- Prepend the key-column branch; STUFF(..., 1, 10, '') removes the leading
-- 'UNION ALL '. COALESCE keeps the statement intact when every column is a
-- key column and @sqlstring is still NULL.
SET @sqlstring = stuff('UNION ALL SELECT '',''''PRIMARY KEY'''' ''+ column_name + '' '' col1, ordinal_position
FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
WHERE table_name = ''' + @table_name+ '''' + COALESCE(@sqlstring, ''), 1, 10, '') + 'order by 2'

INSERT @t
EXEC( @sqlstring)

-- Concatenate the ",'<label>' <column>" fragments into one select list.
SELECT @text = COALESCE(@text, '') + col1
FROM @t

-- Drop the leading comma and run "select '<label>' <column>, ...".
SET @sqlstring2 ='select '+stuff(@text,1,1,'')
EXEC( @sqlstring2)
Result:
id host_id date col1
PRIMARY KEY PRIMARY KEY PRIMARY KEY NULL
Test table
-- Test table: composite clustered primary key (id, host_id, date) plus two
-- nullable varchar columns.
CREATE TABLE [dbo].[test_table]
(
    [id]      int           NOT NULL,
    [host_id] [int]         NOT NULL,
    [date]    [datetime]    NOT NULL,
    [col1]    [varchar](20) NULL,
    [col2]    [varchar](20) NULL,
    CONSTRAINT [PK_test_table] PRIMARY KEY CLUSTERED ([id] ASC, [host_id] ASC, [date] ASC)
)
Test data
-- One row with id = 1 whose col1 is NULL and col2 is populated.
INSERT INTO test_table ([id], [host_id], [date], [col1], [col2])
VALUES (1, 1, getdate(), null, 'somevalue')