TSQL full join with special formatting - tsql

I have a nice lady who wants specific formatting on the output, and I see that it's very special, though against common rules. Can you advise whether I can achieve this somehow with T-SQL? I pasted sample input and the wanted output. I think this is a kind of FULL JOIN, but it needs more cleaning, and I'm a bit lost here. This is my first project. I tried to convince her to use 2 separate tables for the output to maintain some integrity but failed, so I'm taking this as a challenge! There is NO relationship between LineOrd and LinePr. I composed something with multiple steps, but I feel there should be something more elegant than doing 4-5 steps.
Best
-- Sample data: order lines (#o) and profile lines (#p). Both are keyed by
-- OrderID, but their line numberings are independent of each other.
DROP TABLE IF EXISTS #o;
CREATE TABLE #o (OrderID int, LineOrd int, ProductID int);
INSERT INTO #o (OrderID, LineOrd, ProductID)
VALUES
    (111,  1, 10001),
    (2222, 1, 2001),
    (2222, 2, 2002),
    (2222, 3, 2003),
    (3333, 1, 30001),
    (3333, 2, 30002);

DROP TABLE IF EXISTS #p;
CREATE TABLE #p (OrderID int, LinePr int, ProfileID CHAR(10));
INSERT INTO #p (OrderID, LinePr, ProfileID)
VALUES
    (111,  1, 'alpha1'),
    (111,  2, 'bravo1'),
    (2222, 1, 'charlie2'),
    (2222, 2, 'delta2'),
    (3333, 1, 'echo3'),
    (3333, 2, 'hotel3');
-- select * from #o; select * from #p

-- First attempt: join on OrderID only, so every order line is paired with
-- every profile line of the same order; helper columns are added for the
-- later clean-up steps.
SELECT
    o.OrderID,
    o.LineOrd,
    o.ProductID,
    '--' AS f,
    p.LinePr,
    p.ProfileID,
    ROW_NUMBER() OVER (PARTITION BY o.OrderID, o.LineOrd ORDER BY o.LineOrd) AS rn,
    ROW_NUMBER() OVER (PARTITION BY o.OrderID, p.LinePr ORDER BY p.LinePr) AS rn2,
    -- highest line number per order on each side
    (SELECT MAX(o2.LineOrd) FROM #o AS o2 WHERE o2.OrderID = o.OrderID) AS OrdMax,
    (SELECT MAX(p2.LinePr) FROM #p AS p2 WHERE p2.OrderID = p.OrderID) AS PrMax,
    0 AS sw
FROM #o AS o
FULL JOIN #p AS p
    ON p.OrderID = o.OrderID
ORDER BY
    o.OrderID,
    o.LineOrd,
    p.LinePr;
-- need more cleaning

You need FULL OUTER JOIN on both OrderID and LineOrd. And use COALESCE() to return first not null column of OrderID
-- Pair order lines with profile lines by OrderID and line number; a line
-- without a partner on the other side is kept, with NULLs for the missing side.
SELECT
    COALESCE(o.OrderID, p.OrderID) AS OrderID,
    o.LineOrd,
    o.ProductID,
    p.LinePr,
    p.ProfileID
FROM #o AS o
FULL OUTER JOIN #p AS p
    ON  o.OrderID = p.OrderID
    AND o.LineOrd = p.LinePr
ORDER BY
    OrderID,
    COALESCE(o.LineOrd, p.LinePr)

Related

TSQL - in a string, replace a character with a fixed one every 2 characters

I can't replace every 2 characters of a string with a '.'
-- Each column replaces exactly one character (position 3, 5, 7 or 9) with '.'
select
    STUFF('abcdefghi', 3, 1, '.') c3,
    STUFF('abcdefghi', 5, 1, '.') c5,
    STUFF('abcdefghi', 7, 1, '.') c7,
    STUFF('abcdefghi', 9, 1, '.') c9
if I use STUFF I should subsequently overlap the strings c3, c5, c7 and c9. but I can't find a method
can you help me?
initial string:
abcdefghi
the result I would like is
ab.de.gh.
the string can be up to 50 characters
Create a numbers / tally / digits table, if you don't have one already, then you can use this to target each character position:
-- Replace every 3rd character of a string with '.', e.g. 'abcdefghi' -> 'ab.de.gh.'
with digits as ( /* This would be a real numbers/tally table; here it just generates 1..100 for testing */
    select tens.n * 10 + ones.n + 1 as n
    from (values(0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) as ones(n)
    cross join (values(0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) as tens(n)
), t as (
    select 'abcdefghi' as s
)
select String_Agg(
           case when d.n % 3 = 0 then '.' else Substring(t.s, d.n, 1) end,
           ''
       ) within group (order by d.n) as result  -- without WITHIN GROUP the character order is not guaranteed
from t
cross apply digits d
where d.n <= Len(t.s)  -- inclusive: the last character must be processed too
Using for xml with existing table
-- Same replacement for servers without STRING_AGG, one result row per id.
-- NOTE(review): assumes an existing table t(id, s) holding the strings - confirm.
with digits as ( /* stand-in for a real numbers/tally table: 1..100 */
    select tens.n * 10 + ones.n + 1 as n
    from (values(0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) as ones(n)
    cross join (values(0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) as tens(n)
),
r as (
    -- one row per character position; n is kept so the characters can be
    -- re-assembled in their original order
    select t.id,
           d.n,
           case when d.n % 3 = 0 then '.' else Substring(t.s, d.n, 1) end as ch
    from t
    cross apply digits d
    where d.n <= Len(t.s)  -- inclusive: the last character must be processed too
)
select result = (
           select '' + r2.ch
           from r r2
           where r2.id = r.id
           order by r2.n
           for xml path(''), type
       ).value('.', 'varchar(max)')  -- TYPE + value() undoes XML escaping of &, <, >
from r
group by r.id
You can try it like this:
The easiest might be a quirky update, like here:
-- "Quirky update": the variable is re-assigned once per generated row, each
-- time overwriting the character at position 3*pos with '.'.
-- Local variables need the @ sigil (# denotes a temp table).
DECLARE @string VARCHAR(100) = 'abcdefghijklmnopqrstuvwxyz';

SELECT @string = STUFF(@string, 3 * A.pos, 1, '.')
FROM (
    -- one row per dot to place; master..spt_values only serves as a row source
    SELECT TOP (LEN(@string) / 3) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
    FROM master..spt_values
) AS A(pos);

SELECT @string;
Better/Cleaner/Prettier was a recursive CTE:
We use a declared table to have some tabular sample data
-- Table variable with sample strings of different lengths
-- (table variables need the @ sigil; # denotes a temp table).
DECLARE @tbl TABLE(ID INT IDENTITY, SomeString VARCHAR(200));
INSERT INTO @tbl VALUES('')
,('a')
,('ab')
,('abc')
,('abcd')
,('abcde')
,('abcdefghijklmnopqrstuvwxyz');
--the query
WITH recCTE AS
(
    -- anchor: take the first two characters and skip the third
    SELECT ID
          ,SomeString
          ,(LEN(SomeString)+1)/3 AS CountDots
          ,1 AS OccuranceOfDot
          ,SUBSTRING(SomeString,4,LEN(SomeString)) AS RestString
          ,CAST(LEFT(SomeString,2) AS VARCHAR(MAX)) AS Growing
    FROM @tbl
    UNION ALL
    -- step: append '.' plus the next two characters, then skip one more
    SELECT t.ID
          ,r.SomeString
          ,r.CountDots
          ,r.OccuranceOfDot+2
          ,SUBSTRING(RestString,4,LEN(RestString))
          ,CONCAT(Growing,'.',LEFT(r.RestString,2))
    FROM @tbl t
    INNER JOIN recCTE r ON t.ID=r.ID
    WHERE r.OccuranceOfDot/2<r.CountDots-1
)
-- keep, per ID, only the row with the highest OccuranceOfDot (the last step)
SELECT TOP 1 WITH TIES ID,Growing
FROM recCTE
ORDER BY ROW_NUMBER() OVER(PARTITION BY ID ORDER BY OccuranceOfDot DESC);
--the result
1
2 a
3 ab
4 ab
5 ab
6 ab.de
7 ab.de.gh.jk.mn.pq.st.vw.yz
The idea in short
We use a recursive CTE to walk along the string
we add the needed portion together with a dot
We stop when the remaining length is too short to continue
A little magic is the ORDER BY ROW_NUMBER() OVER() together with TOP 1 WITH TIES. This allows all first rows (the first per ID) to appear.

T-SQL how to join with one column a string and one an integer

How to join with one column a string and one an integer?
--PEOPLE_ID 000092437, PersonID 92437
-- One output row per phone slot (home / office / cell) for every matched person
select PC.PEOPLE_ID, Idn.PersonId, 'Home Row 1', PC.Phone1
from #NextIdentityID Idn
inner join PEOPLECHANGES PC
    on Idn.People_ID = PC.People_ID --PEOPLE_ID 000092437, PersonID 92437 one is varchar, one is integer
union all
select PC.PEOPLE_ID, Idn.PersonId, 'Office Row 2', PC.Phone2
from #NextIdentityID Idn
inner join PEOPLECHANGES PC
    on Idn.People_ID = PC.People_ID
union all
select PC.PEOPLE_ID, Idn.PersonId, 'Cell Row 3', PC.Phone3
from #NextIdentityID Idn
inner join PEOPLECHANGES PC
    on Idn.People_ID = PC.People_ID
To make sure your varchar() data doesn't raise any errors, you should check whether it can be converted into an integer. One way to do this is with a CASE expression in the join condition. If a value is not convertible, that row won't join - but at least your query can still run without error.
This example shows how you can avoid potential errors.
-- Demo: join an int key to a varchar key that carries leading zeros and may
-- contain non-numeric values.
create table #tempa(id int, descr varchar(50));
create table #tempb(id varchar(10), descr varchar(50));
insert into #tempa(id,descr) values (1234,'Body getta body getta');
insert into #tempb(id,descr) values ('001234','sis boom ba - rah rah rah');
insert into #tempa(id,descr) values (5678,'Weagle Weagle War Damn Eagle');
insert into #tempb(id,descr) values ('0005678','Kickem in the butt Big blue');
insert into #tempa(id,descr) values (9012,'this wont have a match');
insert into #tempb(id,descr) values ('x0912','sis boom ba');

-- Safe version: TRY_CONVERT yields NULL for anything that is not a valid int,
-- so such rows simply do not match. (ISNUMERIC also accepts values like '1e4'
-- or '$' that CAST(... AS int) still rejects, and a fallback of 0 would
-- wrongly match a real id of 0.)
Select a.id as a_id, b.id as b_id
,a.descr as a_descr, b.descr as b_descr
from #tempa a
left join #tempb b
on a.id = try_convert(int, b.id)

-- this one will raise an error
Select a.id as a_id, b.id as b_id
,a.descr as a_descr, b.descr as b_descr
from #tempa a
left join #tempb b
on a.id = b.id

drop table #tempa;
drop table #tempb;
If you convert the one with leading zeros to an integer you will get equal values:
-- T-SQL cannot return a bare comparison from a SELECT list, so wrap it in CASE:
-- returns 1 because the leading zeros disappear once the string is converted to int.
SELECT CASE WHEN CONVERT(INT, '000092437') = 92437 THEN 1 ELSE 0 END AS IsEqual
However, this assumes that every value in your varchar column can be converted to int.
If that's not the case, then you have to go the other way and convert the integer to a string padded with leading zeros.

Ranking a record based on sort order of multiple related records in T-SQL?

I have two tables, plus a matching table. For argument's sake, let's call them Recipes and Ingredients. Each Recipe should have at least one Ingredient, but may have many. Each Ingredient can be used in many Recipes.
Recipes Ingredients Match
=============== =============== ===============
ID int ID int RecipeID int
Name varchar Name varchar IngredientID int
Sample data:
Recipes Ingredients Match (shown as CDL but stored as above)
=============== =============== ===============
Soup Chicken Soup: Chicken, Tomatoes
Pizza Tomatoes Pizza: Cheese, Chicken, Tomatoes
Chicken Sandwich Cheese C. Sandwich: Bread, Chicken, Tomatoes
Turkey Sandwich Bread T. Sandwich: Bread, Cheese, Tomatoes, Turkey
Turkey
Here's the problem: I need to sort the Recipes based on the name(s) of their Ingredients. Given the above sample data, I would need this sort order for recipes:
Turkey Sandwich (First ingredient bread, then cheese)
Chicken Sandwich (First ingredient bread, then chicken)
Pizza (First ingredient cheese)
Soup (First ingredient chicken)
Ranking the recipes by the first ingredient is straightforward:
-- Number every recipe/ingredient pair alphabetically by ingredient name, then
-- keep the smallest number per recipe (i.e. rank by first ingredient only).
WITH recipesranked AS (
    SELECT
        r.ID,
        r.Name,
        r.Description,
        ROW_NUMBER() OVER (ORDER BY i.Name) AS SortOrder
    FROM Recipes AS r
    LEFT JOIN Match AS m
        ON m.RecipeID = r.ID
    LEFT JOIN Ingredients AS i
        ON i.ID = m.IngredientID
)
SELECT
    ID,
    Name,
    Description,
    MIN(SortOrder)
FROM recipesranked
GROUP BY
    ID,
    Name,
    Description;
Beyond that, I'm stuck. In my example above, this almost works, but leaves the two sandwiches in an ambiguous order.
I have a feeling that the MIN(SortOrder) should be replaced by something else, maybe a correlated subquery looking for the non-existence of another record in the same CTE, but haven't figured out the details.
Any ideas?
(It is possible for a Recipe to have no ingredients. I don't care what order they come out in, but the end would be ideal. Not my main concern at this point.)
I'm using SQL Server 2008 R2.
Update: I added an SQL Fiddle for this and updated the example here to match:
http://sqlfiddle.com/#!3/38258/2
Update: I have a sneaking suspicion that if there is a solution, it involves a cross-join to compare every combination of Recipe/Ingredient against every other, then filtering that somehow.
I think this will give you what you want (based on your supplied Fiddle)
-- Show recipes ranked by all their ingredients alphabetically.
-- Every ingredient gets a weight of 2^n, where n is its position in the
-- reverse-alphabetical ingredient list, so alphabetically earlier ingredients
-- weigh more; a recipe's sort key is the sum of its ingredients' weights.
WITH ingredient_weight AS (
    SELECT
        ID,
        POWER(2.0, ROW_NUMBER() OVER (ORDER BY Name DESC)) AS SortOrder
    FROM Ingredients
),
recipesranked AS (
    SELECT
        r.ID,
        r.Name,
        w.SortOrder
    FROM Recipes AS r
    LEFT JOIN Match AS m
        ON m.RecipeID = r.ID
    LEFT JOIN ingredient_weight AS w
        ON w.ID = m.IngredientID
)
SELECT
    ID,
    Name,
    SUM(SortOrder)
FROM recipesranked
GROUP BY
    ID,
    Name
-- Sort by the summed weights. Both sandwiches start with Bread, so the tie is
-- broken by the next ingredient: Turkey Sandwich comes first because its #2
-- sorted ingredient is Cheese, while Chicken Sandwich's #2 is Chicken.
ORDER BY SUM(SortOrder) DESC;
It just uses POWER to ensure that the most significant ingredients get weighted first.
This will work for any number of recipes and up to 120 ingredients (in total)
Will not work if recipes contain duplicate ingredients, though you could filter those out if they could occur
Binary Flag version:
-- Binary-flag ranking: each ingredient gets one bit (alphabetically earlier
-- names get the higher bits); a recipe's rank follows the sum of its bits.
;with IngredientFlag( IngredientId, Flag )
as
(
    select
        i.id
        -- bigint base: with an int base POWER overflows once there are more
        -- than 31 ingredients
      , POWER( CAST( 2 AS bigint ), row_number() over ( order by i.Name desc ) - 1 )
    from
        Ingredients i
)
, RecipeRank( RecipeId, [Rank] )
as
(
    select
        m.RecipeID
      , row_number() /* or rank() */ over ( order by SUM( flag.Flag ) desc )
    from
        Match m
        inner join IngredientFlag flag
            on m.IngredientID = flag.IngredientId
    group by
        m.RecipeID
)
select
    rr.RecipeId
  , r.Name
  , rr.[Rank]
from
    RecipeRank rr
    inner join Recipes r
        on rr.RecipeId = r.id
order by
    rr.[Rank]  -- return the recipes in ranked order instead of arbitrary order
Str Concat version:
-- Ranks recipes by the concatenation of their alphabetically ordered
-- ingredient names, built step by step with a recursive CTE.
-- NOTE(review): names are concatenated without a separator, so different
-- ingredient lists could produce the same or misleadingly ordered strings -
-- confirm whether a delimiter is needed.
-- order the ingredients per recipe
;with RecipeIngredientOrdinal( RecipeId, IngredientId, Name, Ordinal )
as
(
select
m.RecipeID
, m.IngredientID
, i.Name
, Row_Number() over ( partition by m.RecipeId order by i.Name ) Ordinal
from
Match m
inner join Ingredients i
on m.IngredientID = i.id
)
-- get ingredient count per recipe
, RecipeIngredientCount( RecipeId, IngredientCount )
as
(
select
m.RecipeID
, count(1)
from
Match m
group by
m.RecipeID
)
-- recursively build concatenated ingredient list per recipe
-- (note this will return incomplete lists which is why I include
-- 'generational' in the name)
-- The third column (IngredientCount) holds the ordinal of the last
-- ingredient appended so far.
, GenerationalConcatenatedIngredientList( RecipeId, Ingredients, IngredientCount )
as
(
-- anchor: start each list with the recipe's first ingredient
select
rio.RecipeID
, cast( rio.Name as varchar(max) )
, rio.Ordinal
from
RecipeIngredientOrdinal rio
where
rio.Ordinal = 1
union all
-- step: append the ingredient whose ordinal follows the last one appended
select
rio.RecipeID
, cil.Ingredients + rio.Name
, rio.Ordinal
from
RecipeIngredientOrdinal rio
inner join GenerationalConcatenatedIngredientList cil
on rio.RecipeID = cil.RecipeId and rio.Ordinal = cil.IngredientCount + 1
)
-- return row_number or rank ordered by the concatenated ingredients list
-- (don't need to return Ingredients but shown for demonstrative purposes)
, RecipeRankByIngredients( RecipeId, Rank, Ingredients )
as
(
select
cil.RecipeId
, row_number() over ( order by cil.Ingredients ) -- or rank()
, cil.Ingredients
from
GenerationalConcatenatedIngredientList cil
inner join RecipeIngredientCount ric
on cil.RecipeId = ric.RecipeId
-- don't forget to filter for only the completed ingredient lists
-- and ignore all intermediate values
and cil.IngredientCount = ric.IngredientCount
)
select * from RecipeRankByIngredients
This should get you what you need:
-- Keep, per recipe name, the row(s) of its alphabetically first ingredient
-- and order the recipes by that ingredient's global position.
WITH recipesranked AS (
    SELECT
        r.ID,
        r.Name,
        ROW_NUMBER() OVER (ORDER BY i.Name) AS SortOrder,
        RANK() OVER (PARTITION BY r.Name ORDER BY i.Name) AS RankOrder
    FROM Recipes AS r
    LEFT JOIN Match AS m
        ON m.RecipeID = r.ID
    LEFT JOIN Ingredients AS i
        ON i.ID = m.IngredientID
)
SELECT
    ID,
    Name,
    SortOrder,
    RankOrder
FROM recipesranked
WHERE RankOrder = 1
ORDER BY SortOrder;
The only alternative way I can think of to do it, is to use dynamic sql to generate a pivot
This doesn't have the limitation on the number of ingredients that my alternative has, but doesn't exactly feel elegant!
-- Dynamic pivot: one column per ingredient position ([1], [2], ...), each
-- holding that ingredient's alphabetical sort number; recipes are then
-- ordered by all of those columns.
-- Local variables need the @ sigil (# denotes a temp table).

-- Largest number of ingredients used by any single recipe
DECLARE @MaxIngredients INT
SELECT @MaxIngredients = MAX(IngredientCount)
FROM
(
    SELECT COUNT(*) AS IngredientCount
    FROM Match
    GROUP BY RecipeID
) A

-- Build the pivot / ORDER BY column list: [1],[2],...,[@MaxIngredients]
DECLARE @COLUMNS nvarchar(max)
SELECT @COLUMNS = N'[1]'
DECLARE @COLUMN INT
SELECT @COLUMN = 2
WHILE (@COLUMN <= @MaxIngredients)
BEGIN
    SELECT @COLUMNS = @COLUMNS + N',[' + CAST(@COLUMN AS varchar(19)) + N']', @COLUMN = @COLUMN + 1
END

-- The column list is generated from integers only, so nothing user-supplied
-- is concatenated into the statement.
DECLARE @SQL nvarchar(max)
SELECT @SQL =
N'WITH recipesranked as(
SELECT *
FROM
(
SELECT M.RecipeID,
ROW_NUMBER() OVER (PARTITION BY M.RecipeID ORDER BY I.SortOrder) AS IngredientIndex,
I.SortOrder
FROM Match M
LEFT
JOIN
(
SELECT *, ROW_NUMBER() OVER (ORDER BY Name) As SortOrder
FROM Ingredients
) I
ON I.ID = M.IngredientID
) AS SourceTable
PIVOT
(
MIN(SortOrder) --min here is just for the syntax, there will only be one value
FOR IngredientIndex IN (' + @COLUMNS + N')
) AS PivotTable)
SELECT R.Name
FROM RecipesRanked RR
JOIN Recipes R
ON RR.RecipeID = R.ID
ORDER BY ' + @COLUMNS
EXEC SP_EXECUTESQL @SQL
Create a function and use that.
-- Returns the ingredient names of the given recipe as one comma-separated
-- string in alphabetical order, or NULL when the recipe has no ingredients.
-- Parameters need the @ sigil. The list is built with FOR XML PATH because
-- the order of "SELECT @var = @var + ... ORDER BY" concatenation is not
-- guaranteed by SQL Server.
CREATE FUNCTION GetIngredients(@RecipeName varchar(200))
RETURNS VARCHAR(MAX)
AS
BEGIN
    RETURN STUFF(
        (
            SELECT ', ' + Ingredients.Name
            FROM Recipes
            INNER JOIN Match ON Match.RecipeID = Recipes.ID
            INNER JOIN Ingredients ON Ingredients.ID = Match.IngredientID
            WHERE Recipes.Name = @RecipeName
            ORDER BY Ingredients.Name ASC
            FOR XML PATH(''), TYPE
        ).value('.', 'varchar(max)'),  -- TYPE + value() undoes XML escaping
        1, 2, ''                       -- strip the leading ', '
    )
END
GO
-- Order the recipes by their alphabetical ingredient list
SELECT
    Recipes.Name AS RecipeName,
    dbo.GetIngredients(Recipes.Name) AS [Ingredients]
FROM Recipes
ORDER BY [Ingredients]

Query to get row from one table, else random row from another

tblUserProfile - I have a table which holds all the Profile Info (too many fields)
tblMonthlyProfiles - Another table which has just the ProfileID in it (the idea is that this table holds 2 profileids which sometimes become monthly profiles (on selection))
Now when I need to show monthly profiles, I simply do a select from this tblMonthlyProfiles and Join with tblUserProfile to get all valid info.
If there are no rows in tblMonthlyProfile, then monthly profile section is not displayed.
Now the requirement is to ALWAYS show Monthly Profiles. If there are no rows in monthlyProfiles, it should pick up 2 random profiles from tblUserProfile. If there is only one row in monthlyProfiles, it should pick up only one random row from tblUserProfile.
What is the best way to do all this in one single query ?
I thought something like this
-- Two rows in random order from all profiles, with monthly-profile columns
-- attached where a match exists
select top 2 *
from tblUserProfile as P
left join tblMonthlyProfiles as M
    on M.profileid = P.profileid
order by NEWID()
But this always gives me 2 random rows from tblProfile. How can I solve this ?
Try something like this:
-- Always return two profiles: monthly profiles first (FinalOrder '1'), topped
-- up with random non-monthly profiles (FinalOrder '2') when fewer than two
-- monthly profiles exist.
SELECT TOP 2 Field1, Field2, Field3, FinalOrder
FROM
(
    SELECT P.Field1, P.Field2, P.Field3, '1' AS FinalOrder
    FROM tblUserProfile AS P
    INNER JOIN tblMonthlyProfiles AS M
        ON M.profileid = P.profileid
    UNION ALL
    -- candidates for the random fill: every profile that is not a monthly one
    SELECT P.Field1, P.Field2, P.Field3, '2' AS FinalOrder
    FROM tblUserProfile AS P
    WHERE NOT EXISTS (
        SELECT 1
        FROM tblMonthlyProfiles AS M
        WHERE M.profileid = P.profileid
    )
) AS Candidates
-- monthly rows win; NEWID() randomises the order within each group
ORDER BY FinalOrder, NEWID()
The idea being to pick two monthly profiles (if that many exist) and then 2 random profiles (as you correctly did) and then UNION them. You'll have between 2 and 4 records at that point. Grab the top two. FinalOrder column is an easy way to make sure that you try and get the monthly's first.
If you have control of the table structure, you might save yourself some trouble by simply adding a boolean field IsMonthlyProfile to the UserProfile table. Then it's a single table query, order by IsBoolean, NewID()
In SQL 2000+ compliant syntax you could do something like:
-- Query template ("..." stands for the column list, which must match in
-- number and type across all three branches).
-- Branch 1: two random profiles, only when tblMonthlyProfile has no rows.
Select ...
From (
Select TOP 2 ...
From tblUserProfile As UP
Where Not Exists( Select 1 From tblMonthlyProfile As MP1 )
Order By NewId()
) As RandomProfile
Union All
-- Branch 2: the monthly profiles themselves, when at least one exists.
Select MP....
From tblUserProfile As UP
Join tblMonthlyProfile As MP
On MP.ProfileId = UP.ProfileId
Where ( Select Count(*) From tblMonthlyProfile As MP1 ) >= 1
Union All
-- Branch 3: one random profile, only when exactly one monthly profile exists.
-- NOTE(review): this branch does not exclude the monthly profile itself, so
-- the same profile could be returned twice - confirm.
Select ...
From (
Select TOP 1 ...
From tblUserProfile As UP
Where ( Select Count(*) From tblMonthlyProfile As MP1 ) = 1
Order By NewId()
) As RandomProfile
Using SQL 2005+ CTE you can do:
-- Query template ("..." stands for the column list, which must match in
-- number and type across both branches).
-- Non-monthly profiles are numbered in random order; the second branch takes
-- as many of them as are needed to reach two rows in total.
With
RandomProfiles As
(
    -- Num must follow the random order: numbering by ProfileID would make a
    -- "Num = 1" filter match only if the lowest ProfileID happened to be drawn
    Select ..., ROW_NUMBER() OVER ( ORDER BY NewId() ) As Num
    From tblUserProfile As UP
    -- never offer a profile that is already a monthly profile
    Where Not Exists( Select 1
                      From tblMonthlyProfile As MP0
                      Where MP0.ProfileId = UP.ProfileId )
)
Select MP.Col1, ...
From tblUserProfile As UP
Join tblMonthlyProfile As MP
    On MP.ProfileId = UP.ProfileId
Union All
Select ...
From RandomProfiles
-- 2 random rows when there are no monthly profiles, 1 when there is one,
-- none when there are two or more
Where Num <= 2 - ( Select Count(*) From tblMonthlyProfile As MP1 )
The CTE has the advantage of only querying for the random profiles once and the use of the ROW_NUMBER() column.
Obviously, in all the UNION statements the number and type of the columns must match.

TSQL Group By with an "OR"?

This query for creating a list of Candidate duplicates is easy enough:
-- Candidates sharing first name, home phone AND e-mail with at least one other row
SELECT
    Count(*),
    Can_FName,
    Can_HPhone,
    Can_EMail
FROM Can
GROUP BY
    Can_FName,
    Can_HPhone,
    Can_EMail
HAVING Count(*) > 1
But if the actual rule I want to check against is FName and (HPhone OR Email) - how can I adjust the GROUP BY to work with this?
I'm fairly certain I'm going to end up with a UNION SELECT here (i.e. do FName, HPhone on one and FName, EMail on the other and combine the results) - but I'd love to know if anyone knows an easier way to do it.
Thank you in advance for any help.
Scott in Maine
Before I can advise anything, I need to know the answer to this question:
name phone email
John 555-00-00 john#example.com
John 555-00-01 john#example.com
John 555-00-01 john-other#example.com
What COUNT(*) you want for this data?
Update:
If you just want to know that a record has any duplicates, use this:
-- Inline sample data, then every row that has at least one other row with the
-- same name and either the same phone or the same e-mail.
WITH q AS (
    SELECT v.id, v.name, v.phone, v.email
    FROM (VALUES
        (1, 'John',  '555-00-00', 'john#example.com'),
        (2, 'John',  '555-00-01', 'john#example.com'),
        (3, 'John',  '555-00-01', 'john-other#example.com'),
        (4, 'James', '555-00-00', 'james#example.com'),
        (5, 'James', '555-00-01', 'james-other#example.com')
    ) AS v (id, name, phone, email)
)
SELECT *
FROM q AS qo
WHERE EXISTS (
    SELECT NULL
    FROM q AS qi
    WHERE qi.name = qo.name
      AND qi.id <> qo.id
      AND (qi.phone = qo.phone OR qi.email = qo.email)
)
It's more efficient, but doesn't tell you where the duplicate chain started.
This query select all entries along with the special field, chainid, that indicates where the duplicate chain started.
-- Inline sample data (q), a recursive walk over duplicate links (dup) and a
-- filter that keeps only chains not already reached from a lower chainid.
WITH q AS (
SELECT 1 AS id, 'John' AS name, '555-00-00' AS phone, 'john#example.com' AS email
UNION ALL
SELECT 2 AS id, 'John', '555-00-01', 'john#example.com'
UNION ALL
SELECT 3 AS id, 'John', '555-00-01', 'john-other#example.com'
UNION ALL
SELECT 4 AS id, 'James', '555-00-00', 'james#example.com'
UNION ALL
SELECT 5 AS id, 'James', '555-00-01', 'james-other#example.com'
),
dup AS (
-- anchor: every row starts its own chain (chainid = its own id, depth d = 1)
SELECT id AS chainid, id, name, phone, email, 1 as d
FROM q
UNION ALL
-- step: follow links to rows with the same name and the same phone or e-mail;
-- only rows with a higher id are followed, which keeps the recursion finite
SELECT chainid, qo.id, qo.name, qo.phone, qo.email, d + 1
FROM dup
JOIN q qo
ON qo.name = dup.name
AND (qo.phone = dup.phone OR qo.email = dup.email)
AND qo.id > dup.id
),
chains AS
(
-- drop chains whose starting id already appears as a member of a chain
-- with a lower chainid
SELECT *
FROM dup do
WHERE chainid NOT IN
(
SELECT id
FROM dup di
WHERE di.chainid < do.chainid
)
)
SELECT *
FROM chains
ORDER BY
chainid
None of these answers is correct. Quassnoi's is a decent approach, but you will notice one fatal flaw in the expressions "qo.id > dup.id" and "di.chainid < do.chainid": comparisons made by ID! This is ALWAYS bad practice because it depends on some inherent ordering in the IDs. IDs should NEVER be given any implicit meaning and should ONLY participate in equality or null testing. You can easily break Quassnoi's solution in this example by simply reordering the IDs in the data.
The essential problem is a disjunctive condition with a grouping, which leads to the possibility of two records being related through an intermediate, though they are not directly relatable.
e.g., you stated these records should all be grouped:
(1) John 555-00-00 john#example.com
(2) John 555-00-01 john#example.com
(3) John 555-00-01 john-other#example.com
You can see that #1 and #2 are relatable, as are #2 and #3, but clearly #1 and #3 are not directly relatable as a group.
This establishes that a recursive or iterative solution is the ONLY possible solution.
So, recursion is not viable since you can easily end up in a looping situation. This is what Quassnoi was trying to avoid with his ID comparisons, but in doing so he broke the algorithm. You could try limiting the levels of recursion, but you may not then complete all relations, and you will still potentially be following loops back upon yourself, leading to excessive data size and prohibitive inefficiency.
The best solution is ITERATIVE: Start a result set by tagging each ID as a unique group ID, and then spin through the result set and update it, combining IDs into the same unique group ID as they match on the disjunctive condition. Repeat the process on the updated set each time until no further updates can be made.
I will create example code for this soon.
GROUP BY doesn't support OR - it's implicitly AND and must include every non-aggregator in the select list.
I assume you also have a unique ID integer as the primary key on this table. If you don't, it's a good idea to have one, for this purpose and many others.
Find those duplicates by a self-join:
-- Pair the lowest-ID row of each distinct (name, phone, e-mail) combination
-- with every later row that has the same first name and the same phone or
-- the same e-mail.
select
    c1.ID
  , c1.Can_FName
  , c1.Can_HPhone
  , c1.Can_Email
  , c2.ID
  , c2.Can_FName
  , c2.Can_HPhone
  , c2.Can_Email
from
(
    select
        min(ID) as ID,  -- must be aliased, otherwise c1.ID cannot be referenced
        Can_FName,
        Can_HPhone,
        Can_Email
    from Can
    group by
        Can_FName,
        Can_HPhone,
        Can_Email
) c1
inner join Can c2
    on c1.ID < c2.ID
where
    c1.Can_FName = c2.Can_FName
    and (c1.Can_HPhone = c2.Can_HPhone OR c1.Can_Email = c2.Can_Email)
order by
    c1.ID
The query gives you N-1 rows for each N duplicate combinations - if you want just a count along with each unique combination, count the rows grouped by the "left" side:
-- Number of rows in each duplicate group: the matched later rows plus the
-- "left" row itself.
select
    count(1) + 1 as DuplicateCount
  , c1.Can_FName
  , c1.Can_HPhone
  , c1.Can_Email
from
(
    select
        min(ID) as ID,  -- must be aliased, otherwise c1.ID cannot be referenced
        Can_FName,
        Can_HPhone,
        Can_Email
    from Can
    group by
        Can_FName,
        Can_HPhone,
        Can_Email
) c1
inner join Can c2
    on c1.ID < c2.ID
where
    c1.Can_FName = c2.Can_FName
    and (c1.Can_HPhone = c2.Can_HPhone OR c1.Can_Email = c2.Can_Email)
group by
    c1.Can_FName
  , c1.Can_HPhone
  , c1.Can_Email
Granted, this is more involved than a union - but I think it illustrates a good way of thinking about duplicates.
Project the desired transformation first from a derived table, then do the aggregation:
-- Project a combined phone+e-mail key first, then group on it together with
-- the first name.
WITH Can_Transformed AS (
    SELECT
        Can_FName,
        ISNULL(Can_HPhone,'') + ISNULL(Can_EMail,'') AS Can_HPhoneOrEMail
    FROM Can
)
SELECT
    COUNT(*),
    CAN_FName,
    Can_HPhoneOrEMail
FROM Can_Transformed
GROUP BY
    Can_FName,
    Can_HPhoneOrEMail
HAVING Count(*) > 1
Adjust your 'OR' operation as needed in the derived table project list.
I know this answer will be criticised for the use of the temp table, but it will work anyway:
-- create temp table to give the table a unique key
create table #tmp(
    ID int identity,
    can_Fname varchar(200) null, -- real type and len here
    can_HPhone varchar(200) null, -- real type and len here
    can_Email varchar(200) null -- real type and len here
)

-- just copy the rows where a duplicate fname exists
-- (better performance, especially for a big table)
insert into #tmp (can_Fname, can_HPhone, can_Email)
select can_fname, can_hphone, can_email
from Can
where can_fname in (select can_fname
                    from Can
                    group by can_fname
                    having count(*) > 1)

-- select the rows that have the same fname and
-- at least the same phone or email
-- (ISNULL(..., '') makes two missing values compare as equal)
select a.can_Fname, a.can_Hphone, a.can_Email
from #tmp a
where exists
    (select *
     from #tmp b
     where a.ID <> b.ID
       and a.can_fname = b.can_fname
       and (isnull(a.can_HPhone,'') = isnull(b.can_HPhone,'')
            or isnull(a.can_email,'') = isnull(b.can_email,'')))
Try this:
-- Per first name, count the rows that tie for the lowest phone or the lowest
-- e-mail and report the names where more than one such row exists.
WITH X AS (
    SELECT
        RANK() OVER (PARTITION BY Can_FName ORDER BY Can_FName, Can_HPhone) AS rnk_p,
        RANK() OVER (PARTITION BY Can_FName ORDER BY Can_FName, Can_EMail) AS rnk_m,
        Can_FName
    FROM Can
)
SELECT
    Can_FName,
    COUNT(*)
FROM X
WHERE rnk_p = 1
   OR rnk_m = 1
GROUP BY Can_FName
HAVING COUNT(*) > 1