Find exact FK matches - tsql

Have a very large table (over 200 million rows)
sID int, wordID int (PK sID, wordID)
Want to find the sID's that have the exact same wordID's (and no extras)
For a sID with over 100 wordID the chance of an exact match goes down so willing to limit it to 100
(but would like to go to 1000)
If this was school and sID were classes and wordID were students.
Then I want to find classes that have the exact same students.
sID, wordID
1, 1
1, 2
1, 3
2, 2
2, 3
3, 1
3, 4
5, 1
5, 2
6, 2
6, 3
7, 1
7, 2
8, 1
8, 1
sID 6 and 2 have the exact same wordID's
sID 5, 7, and 8 have the exact same wordID's
This is what I have so far
I would like to eliminate the two delete #temp3_sID1_sID2 and take care of that in the insert above
But I will try any ideas
It is not like you can easily create a table with 200 million rows to test with
-- Scratch tables for the exact-match search.
-- Each drop is guarded so the script also runs in a fresh session; an
-- unconditional DROP TABLE raises an error when the temp table does not exist.
if object_id('tempdb..#temp_sID_wordCount') is not null
    drop table #temp_sID_wordCount
if object_id('tempdb..#temp_count_wordID_sID') is not null
    drop table #temp_count_wordID_sID
if object_id('tempdb..#temp3_wordID_sID_forThatCount') is not null
    drop table #temp3_wordID_sID_forThatCount
if object_id('tempdb..#temp3_sID1_sID2') is not null
    drop table #temp3_sID1_sID2
if object_id('tempdb..#temp3_sID1_sID2_keep') is not null
    drop table #temp3_sID1_sID2_keep

-- number of wordIDs per sID
create table #temp_sID_wordCount (sID int primary key, ccount int not null)
-- (word count, wordID, sID) rows, keyed so one word count can be read as a range
create table #temp_count_wordID_sID (ccount int not null, wordID int not null, sID int not null, primary key (ccount, wordID, sID))
-- working set: the rows of a single word count
create table #temp3_wordID_sID_forThatCount (wordID int not null, sID int not null, primary key(wordID, sID))
-- accumulated result pairs across all word counts
create table #temp3_sID1_sID2_keep (sID1 int not null, sID2 int not null, primary key(sID1, sID2))
-- candidate pairs for a single word count
create table #temp3_sID1_sID2 (sID1 int not null, sID2 int not null, primary key(sID1, sID2))
-- Step 1: count the wordIDs of every sID.
insert into #temp_sID_wordCount
select src.sID, count(*) as ccount
from [FTSindexWordOnce] as src with (nolock)
group by src.sID
order by src.sID;

select count(*) from #temp_sID_wordCount where ccount <= 100; -- 701,966

-- Step 2: copy the (count, wordID, sID) rows of every sID that has between
-- 1 and 10 wordIDs.
truncate table #temp_count_wordID_sID

insert into #temp_count_wordID_sID
select wc.ccount, src.wordID, src.sID
from #temp_sID_wordCount as wc
inner join [FTSindexWordOnce] as src with (nolock)
    on src.sID = wc.sID
where wc.ccount >= 1
  and wc.ccount <= 10
order by wc.ccount, src.wordID, src.sID;

-- NOTE(review): this counts #temp_sID_wordCount again, not the table that was
-- just filled - confirm which table the figure below refers to.
select count(*) from #temp_sID_wordCount; -- 34,860,090
-- Collect exact-match pairs one word count at a time: two sIDs can only hold
-- identical wordID sets if they hold the same number of wordIDs.
truncate table #temp3_sID1_sID2_keep

-- Local variables need the @ prefix; a leading # names a temp table and is not
-- accepted by DECLARE ... INT or FETCH ... INTO. The second variable of the
-- original declaration was never used and has been dropped.
declare @count int

-- one iteration per distinct word count (the ten lowest counts only)
declare cur cursor for
    select top 10 ccount
    from #temp_count_wordID_sID
    group by ccount
    order by ccount

open cur
fetch next from cur into @count

-- @@FETCH_STATUS is 0 for as long as the previous FETCH returned a row
while (@@FETCH_STATUS = 0)
begin
    truncate table #temp3_wordID_sID_forThatCount
    truncate table #temp3_sID1_sID2

    -- wordID and sID for that unique word count;
    -- they can only be exact if they have the same word count
    insert into #temp3_wordID_sID_forThatCount
    select #temp_count_wordID_sID.wordID
         , #temp_count_wordID_sID.sID
    from #temp_count_wordID_sID
    where #temp_count_wordID_sID.ccount = @count
    order by #temp_count_wordID_sID.wordID, #temp_count_wordID_sID.sID

    -- Pair up sIDs that share all @count wordIDs. Because of "<=" every sID is
    -- also paired with itself, and a group of n sIDs yields overlapping pairs;
    -- both are cleaned up by the two deletes below.
    -- sID1 is the group
    insert into #temp3_sID1_sID2
    select w1.sID, w2.sID
    from #temp3_wordID_sID_forThatCount as w1 with (nolock)
    join #temp3_wordID_sID_forThatCount as w2 with (nolock)
        on w1.wordID = w2.wordID
       and w1.sID <= w2.sID
    group by w1.sID, w2.sID
    having count(*) = @count
    order by w1.sID, w2.sID

    -- get rid of the groups of 1 (an sID that only matched itself)
    delete #temp3_sID1_sID2
    where sID1 in (select sID1 from #temp3_sID1_sID2 group by sID1 having count(*) = 1)

    -- get rid of the double dips: rows whose sID1 already appears as the sID2
    -- of a group led by a smaller sID1
    delete #temp3_sID1_sID2
    where #temp3_sID1_sID2.sID1 in
        (select distinct s1del.sID1 -- these are the double dips
         from #temp3_sID1_sID2 as s1base with (nolock)
         join #temp3_sID1_sID2 as s1del with (nolock)
             on s1del.sID1 > s1base.sID1
            and s1del.sID1 = s1base.sID2)

    -- keep what survived for this word count
    insert into #temp3_sID1_sID2_keep
    select #temp3_sID1_sID2.sID1
         , #temp3_sID1_sID2.sID2
    from #temp3_sID1_sID2 with (nolock)
    order by #temp3_sID1_sID2.sID1, #temp3_sID1_sID2.sID2

    fetch next from cur into @count
end

close cur
deallocate cur
-- Final result: every kept (group leader, member) pair.
select kept.sID1, kept.sID2
from #temp3_sID1_sID2_keep as kept with (nolock)
order by kept.sID1, kept.sID2

So, as I see, the task is to find equal subsets.
First we can find pairs of equal subsets:
-- Pairs of sIDs whose wordID sets are identical.
;with word_counts as (
    -- number of wordIDs held by each sID
    select sID, count(wordID) as cnt
    from [Table]
    group by sID
)
select hi.sID, lo.sID
from word_counts as hi
inner join word_counts as lo
    on lo.cnt = hi.cnt     -- only sets of equal size are compared
   and lo.sID < hi.sID     -- each pair is reported once, larger sID first
cross apply (
    -- number of wordIDs the two sIDs have in common
    select count(*) as cnt
    from [Table] as a
    inner join [Table] as b
        on b.wordID = a.wordID
    where a.sID = hi.sID
      and b.sID = lo.sID
) as shared
where shared.cnt = hi.cnt;
Output is:
sID sID
----------- -----------
6 2
7 5
8 5
8 7
And then pairs can be combined into groups, if necessary:
sID gNum
----------- -----------
2 1
6 1
5 2
7 2
8 2
See details in SqlFiddle sample below.
SqlFiddle Sample
The other approach is to calculate hash function for every subset data:
-- One SHA-1 hash per sID, computed over its wordIDs joined in ascending order
-- as '<wordID>|<wordID>|...'.
;with ids as (
    select sID
    from [Table]
    group by sID
)
select
    ids.sID,
    hashbytes('sha1', (
        select convert(varchar(10), w.wordID) + '|'
        from [Table] as w
        where w.sID = ids.sID
        order by w.wordID
        for xml path('')))
from ids
Then subsets can be grouped based on hash value.
SqlFiddle Sample
The last one took less than a minute on my machine for a test data of about 10 million rows (20k sID values up to 1k wordID each). Also you can optimize it by excluding sIDs having no wordID count matches to any other.

Related

sql compare columns to get result

I have the following issue. I have products with 3 different states. Parent, Child and products which are orphans. I am setting Parents as 1, Children as 2 and Orphans as 0. I am struggling to get the Orphan to set to 0. I realise that counting the amount of Parent PLU's is where I am going wrong but I do not know how to resolve this issue. Any help would be appreciated. (As you maybe able to tell, I am a noob and constructive criticism would be appreciated)
Kind Regards,
Jason.
Picture of results from query
-- Classify every line of one order as parent (1), child (2) or orphan (0).
-- Scalar and table variables carry the @ prefix that T-SQL requires.
declare @OrderID int = 1635
declare @Store char(3) = '001'
declare @SortedBy smallint = 2   -- 1 = number rows by OrderID, 2 = by DetailID

DECLARE @tbl TABLE (DetailID int, OrderID int, PLU nvarchar(35), ParentPLU nvarchar(35))

-- order lines of the requested order/store that have a matching PLU row
INSERT INTO @tbl (DetailID, OrderID, PLU, ParentPLU)
SELECT DetailID, OrderDetails.OrderID, OrderDetails.PLU, OrderDetails.ParentPLU
FROM OrderDetails
INNER JOIN PLU
    ON PLU.PLU = OrderDetails.PLU
WHERE OrderDetails.OrderID = @OrderID
  AND OrderDetails.OrderStore = @Store

-- NOTE: both scalar subqueries are uncorrelated - they never reference the
-- current row, so they return the same count for every row. As soon as any
-- ParentPLU in the order matches a PLU, every row becomes 1 or 2 and the
-- ELSE 0 (orphan) branch is unreachable. This is the behaviour being asked about.
SELECT DetailID, OrderID, PLU, ParentPLU,
       CASE WHEN ( SELECT COUNT(DISTINCT ParentPLU)
                   FROM @tbl
                   WHERE ParentPLU IN (SELECT PLU FROM @tbl)
                 ) > 0 AND ParentPLU = '' THEN 1
            WHEN ( SELECT COUNT(DISTINCT ParentPLU)
                   FROM @tbl
                   WHERE ParentPLU IN (SELECT PLU FROM @tbl)
                 ) > 0 THEN 2
            ELSE 0
       END AS ParentChild,
       ROW_NUMBER() OVER (ORDER BY
           CASE WHEN @SortedBy = 1 THEN OrderID END ASC,
           CASE WHEN @SortedBy = 2 THEN DetailID END ASC
       ) AS ID
FROM @tbl
You can use coalesce to get your desired result. First subquery checks for parent state, second for children. If both are null, then it is orphan
-- Per-row classification: 1 = parent, 2 = child, 0 = orphan.
-- The table variable is referenced with its @ prefix, and the computed column
-- is given a name so that clients can address it.
select
    a.DetailID, a.OrderID, a.PLU, a.ParentPLU
    , coalesce((
        -- parent: some other row names this row's PLU as its ParentPLU
        select distinct 1
        from @tbl b
        where a.PLU = b.ParentPLU
      )
      , (
        -- child: this row's ParentPLU exists as a PLU in the order
        select distinct 2
        from @tbl b
        where b.PLU = a.ParentPLU
      ), 0) as ParentChild   -- neither subquery matched: orphan
from
    @tbl a

Recursive CTE to get a Category and all its ancestors [duplicate]

Given a child id, I need to return a query containing all parents of that child as well as their parents till I get to the root parent.
For example, given this data:
ID / Parent ID
1 / 0
2 / 1
3 / 2
4 / 0
5 / 3
So if I passed in ID 5 I would like to get a query with the results:
ID / Parent ID
1 / 0
2 / 1
3 / 2
This table does not work with a hierarchyid type so I suspect that this will need to be done with a CTE, but have no clue how. If it can be done in an SQL query / proc, any help would be appreciated.
Thanks
This is more or less what you want:
-- CTE to prepare hierarchical result set
-- Walks from the row with id = @childId up through its parents.
-- @childId must be declared and set by the caller. The CTE uses a plain
-- identifier, because a # prefix denotes a temp table and an @ prefix a variable.
;WITH results AS
(
    -- anchor: the starting (child) row
    SELECT id,
           parentid
    FROM [table]
    WHERE id = @childId
    UNION ALL
    -- recursive step: the row whose id is the parentid found so far
    SELECT t.id,
           t.parentid
    FROM [table] t
    INNER JOIN results r ON r.parentid = t.id
)
SELECT id,
       parentid
FROM results;
Reference:
CTE: Common Table Expression
Working example:
-- create table with self lookup (parent id)
-- create table with self lookup (parent id); #tmp is a genuine temp table
CREATE TABLE #tmp (id INT, parentid INT);

-- insert some test data
INSERT INTO #tmp (id, parentid)
SELECT 1,0 UNION ALL SELECT 2,1 UNION ALL SELECT 3,2
UNION ALL SELECT 4,0 UNION ALL SELECT 5,3;

-- prepare the child item to look up (local variables need the @ prefix)
DECLARE @childId INT;
SET @childId = 5;

-- build the CTE: start at the child, then repeatedly step to its parent
WITH results AS
(
    SELECT id,
           parentid
    FROM #tmp
    WHERE id = @childId
    UNION ALL
    SELECT t.id,
           t.parentid
    FROM #tmp t
    INNER JOIN results r ON r.parentid = t.id
)
-- output the ancestors only (the child row itself is filtered out)
SELECT id,
       parentid
FROM results
WHERE id != @childId
ORDER BY id;

-- cleanup
DROP TABLE #tmp;
Output:
1 | 0
2 | 1
3 | 2

Remove Duplicates

I have a table like below:
SuppID AreaID SuppNo SupName SupPrice
------------------------------------------------
1 3 526 ANC 100
1 3 985 JTT 200
3 4 100 HIK 300
In the above table, for same SuppID(1) and same AreaID(3), different SuppNo are there (526 & 985) in two different rows.
In this scenario , I'd like to make those two rows into a single row with SuppNo field as blank.
Also my output result should display rows with all the columns.
Any Help?
This should get you started:
-- Demo: collapse rows that share AreaID/SuppID into one row with a blank SuppNo.
-- The table variable needs the @ prefix; DECLARE ... TABLE does not accept a
-- #-prefixed name.
DECLARE @TABLE TABLE (SuppID INT, AreaID INT, SuppNo VARCHAR(5), SupName VARCHAR(5), SupPrice INT)

INSERT INTO @TABLE (SuppID, AreaID, SuppNo, SupName, SupPrice)
SELECT 1,3,'526','ANC',100 UNION ALL
SELECT 1,3,'985','JTT',200 UNION ALL
SELECT 3,4,'100','HIK',300

-- select data before updates
SELECT * FROM @TABLE

-- add a row count by AreaID/SuppID
;WITH T1 AS
(
    SELECT *
          ,SUM(1) OVER(PARTITION BY AREAID,SUPPID) AS ROWCNT
    FROM @TABLE
)
-- set the SuppNo blank on rows that have more than 1 match
UPDATE T1 SET SuppNo='' WHERE ROWCNT>1

-- add a row # by AreaID/SuppID
-- NOTE: the ORDER BY repeats the partition columns, so it does not determine
-- which row of a group gets ROWID 1 (and therefore which SupName/SupPrice survive).
;WITH T2 AS
(
    SELECT *
          ,ROW_NUMBER() OVER(PARTITION BY AREAID,SUPPID ORDER BY AREAID,SUPPID) AS ROWID
    FROM @TABLE
)
-- delete duplicate rows
DELETE
FROM T2
WHERE ROWID>1

-- select data after updates
SELECT * FROM @TABLE

SQL Running Subtraction and Deviation

-- Business scenario: the table models a goods receipt.
-- The first rows, which carry a PurchaseOrder (PO), are the expected lines.
-- The remaining rows are the lines physically received; their quantities may
-- differ from what was expected, e.g. because goods are damaged or short.
-- Each received line therefore carries a status (OK, DAMAGE), and the short
-- quantity has to be derived from the total expected quantity of each item
-- versus the total of its received lines.

-- The existence check looks in the current database, the same one the
-- DROP TABLE and CREATE TABLE below act on.
if object_id('Temp','U') is not null
    drop table Temp

CREATE TABLE Temp
(
    ID INT IDENTITY(1,1) PRIMARY KEY CLUSTERED,
    Item VARCHAR(32),
    PO VARCHAR(32) NULL,
    ExpectedQty INT NULL,
    ReceivedQty INT NULL,
    [STATUS] VARCHAR(32) NULL,
    BoxName VARCHAR(32) NULL
)

-- The first rows (with PO data) are the expected lines,
-- the remaining rows are the received lines.
-- Quantities are written as integer literals to match the INT columns.
INSERT INTO Temp (Item,PO,ExpectedQty,ReceivedQty,[STATUS],BoxName)
SELECT 'ITEM01','PO-01',30,NULL,NULL,NULL UNION ALL
SELECT 'ITEM01','PO-02',20,NULL,NULL,NULL UNION ALL
SELECT 'ITEM02','PO-01',40,NULL,NULL,NULL UNION ALL
SELECT 'ITEM03','PO-01',50,NULL,NULL,NULL UNION ALL
SELECT 'ITEM03','PO-02',30,NULL,NULL,NULL UNION ALL
SELECT 'ITEM03','PO-03',20,NULL,NULL,NULL UNION ALL
SELECT 'ITEM04','PO-01',30,NULL,NULL,NULL UNION ALL
SELECT 'ITEM01',NULL,NULL,20,'OK','box01' UNION ALL
SELECT 'ITEM01',NULL,NULL,25,'OK','box02' UNION ALL
SELECT 'ITEM01',NULL,NULL,5,'DAMAGE','box03' UNION ALL
SELECT 'ITEM02',NULL,NULL,38,'OK','box04' UNION ALL
SELECT 'ITEM02',NULL,NULL,2,'DAMAGE','box05' UNION ALL
SELECT 'ITEM03',NULL,NULL,30,'OK','box06' UNION ALL
SELECT 'ITEM03',NULL,NULL,30,'OK','box07' UNION ALL
SELECT 'ITEM03',NULL,NULL,10,'DAMAGE','box09' UNION ALL
SELECT 'ITEM04',NULL,NULL,25,'OK','box10'
-- The result set below is the expected output for the data above.
-- I need to show the data in this shape and would appreciate an appropriate query.
-- Note: the first row is blank; it only serves as the table header. :)
-- Condition: in no row may ReceivedQty, DamageQty and ShortQty add up to more
-- than ExpectedQty. ITEM03 exercises this scenario.
-- The query should run on a SQL Server 2000 database.
--
-- ITEM03 check: 100 expected in total, 60 received OK, 10 damaged, 30 short.
-- PO-01 takes 50 good units, PO-02 the remaining 10 good + 10 damaged + 10 short,
-- PO-03 is 20 short. (The PO-02 row previously listed 20 received, which
-- exceeded its expected quantity of 30 once damage and shortage were added.)
SELECT '' AS [ITEM], '' AS [PO#], '' AS [ExpectedQty], '' AS [ReceivedQty], '' AS [DamageQty], '' AS [ShortQty] UNION ALL
SELECT 'ITEM01','PO-01','30','30','0','0' UNION ALL
SELECT 'ITEM01','PO-02','20','15','5','0' UNION ALL
SELECT 'ITEM02','PO-01','40','38','2','0' UNION ALL
SELECT 'ITEM03','PO-01','50','50','0','0' UNION ALL
SELECT 'ITEM03','PO-02','30','10','10','10' UNION ALL
SELECT 'ITEM03','PO-03','20','0','0','20' UNION ALL
SELECT 'ITEM04','PO-01','30','25','0','5'
Using this solution as a starting point, I've eventually ended up with this:
-- Distributes each item's received (OK) and damaged quantities over its
-- expected PO lines in PO order and reports what is left short per line.
SELECT
Item,
PO,
ExpectedQty,
-- good units allotted to this line: the full expected quantity, nothing,
-- or the part of the line still covered by good stock
ReceivedQty = CASE
WHEN RemainderQty >= 0 THEN ExpectedQty
WHEN RemainderQty < -ExpectedQty THEN 0
ELSE RemainderQty + ExpectedQty
END,
-- damaged units allotted to this line, evaluated top to bottom
DamageQty = CASE
WHEN RemainderQty >=0 OR ExpectedQty < -TotalRemainderQty THEN 0
WHEN RemainderQty < -ExpectedQty AND TotalRemainderQty > 0 THEN ExpectedQty
WHEN RemainderQty < -ExpectedQty AND TotalRemainderQty < -DamagedQty THEN ExpectedQty + TotalRemainderQty
WHEN RemainderQty > -DamagedQty THEN -RemainderQty
ELSE DamagedQty
END,
-- units of this line covered neither by good nor by damaged stock
ShortQty = CASE
WHEN TotalRemainderQty >= 0 THEN 0
WHEN TotalRemainderQty < -ExpectedQty THEN ExpectedQty
ELSE -TotalRemainderQty
END
FROM (
SELECT
a.Item,
a.PO,
a.ExpectedQty,
b.DamagedQty,
-- good stock left after serving this line and all earlier lines of the item
-- (negative when good stock has run out)
RemainderQty = b.ReceivedQty - a.RunningTotalQty,
-- the same, counting damaged stock as well
TotalRemainderQty = b.ReceivedQty + b.DamagedQty - a.RunningTotalQty
FROM (
-- expected lines (rows without a status) with a running total of ExpectedQty
-- per item, accumulated in PO order via a self-join on a.PO >= a2.PO
SELECT
a.Item,
a.PO,
a.ExpectedQty,
RunningTotalQty = SUM(a2.ExpectedQty)
FROM (SELECT Item, PO, ExpectedQty FROM Temp WHERE STATUS IS NULL) AS a
INNER JOIN (SELECT Item, PO, ExpectedQty FROM Temp WHERE STATUS IS NULL) AS a2
ON a.Item = a2.Item AND a.PO >= a2.PO
GROUP BY
a.Item,
a.PO,
a.ExpectedQty
) a
LEFT JOIN (
-- per item: total quantity received with status OK and with status DAMAGE
SELECT
Item,
ReceivedQty = SUM(CASE STATUS WHEN 'OK' THEN ReceivedQty ELSE 0 END),
DamagedQty = SUM(CASE STATUS WHEN 'DAMAGE' THEN ReceivedQty ELSE 0 END)
FROM Temp
GROUP BY Item
) b ON a.Item = b.Item
) s;

one column split to more column sql server 2008?

Table name: Table1
id name
1 1-aaa-14 milan road
2 23-abcde-lsd road
3 2-mnbvcx-welcoome street
I want the result like this:
Id name name1 name2
1 1 aaa 14 milan road
2 23 abcde lsd road
3 2 mnbvcx welcoome street
This function ought to give you what you need.
--Drop Function Dbo.Part
-- Returns token number @Part (1-based) of @Value, where tokens are separated
-- by the single character @Sep (default '-'). Returns '' when @Value has fewer
-- than @Part tokens.
-- Parameters and local variables carry the @ prefix that T-SQL requires.
Create Function Dbo.Part
(@Value Varchar(8000)
,@Part Int
,@Sep Char(1)='-'
)Returns Varchar(8000)
As Begin
    Declare @Start Int    -- position of the first character of the current token
    Declare @Finish Int   -- position of the separator that ends the current token

    Set @Start=1
    Set @Finish=CharIndex(@Sep,@Value,@Start)

    -- skip one token per iteration until the requested one is reached
    -- or the separators run out
    While (@Part>1 And @Finish>0)Begin
        Set @Start=@Finish+1
        Set @Finish=CharIndex(@Sep,@Value,@Start)
        Set @Part=@Part-1
    End

    If @Part>1 Set @Start=Len(@Value)+1     -- Not found
    If @Finish=0 Set @Finish=Len(@Value)+1  -- Last token on line

    Return SubString(@Value,@Start,@Finish-@Start)
End
Usage:
-- Split Name into its first three dash-separated parts
-- (Default selects the function's default separator).
Select
    t.ID,
    Dbo.Part(t.Name, 1, Default) As Name,
    Dbo.Part(t.Name, 2, Default) As Name1,
    Dbo.Part(t.Name, 3, Default) As Name2
From Dbo.Table1 As t
It's rather compute-intensive, so if Table1 is very long you ought to write the results to another table, which you could refresh from time to time (perhaps once a day, at night).
Better yet, you could create a trigger, which automatically updates Table2 whenever a change is made to Table1. Assuming that column ID is primary key:
-- Holds the pre-split name parts of Dbo.Table1, one row per ID.
Create Table Dbo.Table2
(
    ID    Int           Not Null,
    Name  Varchar(8000),
    Name1 Varchar(8000),
    Name2 Varchar(8000),
    Constraint PK_Table2 Primary Key (ID)
)
-- Keeps Dbo.Table2 in step with Dbo.Table1: rows removed or changed in Table1
-- are deleted from Table2, and new or changed rows are re-inserted with their
-- Name split into parts.
Create Trigger Trigger_Table1 on Dbo.Table1 After Insert,Update,Delete
As Begin
    -- A trigger fires once per statement, so Deleted/Inserted can hold many
    -- rows. IN handles that; "ID = (Select ID From Deleted)" fails as soon as
    -- the subquery returns more than one row.
    If Exists (Select * From Deleted)
        Delete From Dbo.Table2 Where ID In (Select ID From Deleted)

    If Exists (Select * From Inserted)
        Insert Dbo.Table2(ID, Name, Name1, Name2)
        Select ID
              ,Dbo.Part(Name,1,Default)
              ,Dbo.Part(Name,2,Default)
              ,Dbo.Part(Name,3,Default)
        From Inserted
End
Now, do your data manipulation (Insert, Update, Delete) on Table1, but do your Select statements on Table2 instead.
The below solution uses a recursive CTE for splitting the strings, and PIVOT for displaying the parts in their own columns.
-- Splits name on '-' with a recursive CTE, then pivots the first three parts
-- into the columns name, name1 and name2.
-- Inline sample data standing in for the real Table1.
WITH Table1 (id, name) AS (
SELECT 1, '1-aaa-14 milan road' UNION ALL
SELECT 2, '23-abcde-lsd road' UNION ALL
SELECT 3, '2-mnbvcx-welcoome street'
),
-- One row per part: where the part starts and where the dash that ends it is.
cutpositions AS (
-- anchor: the first part starts at position 1; appending '-' to name makes
-- sure a terminating dash is found even for the last part
SELECT
id, name,
rownum = 1,
startpos = 1,
nextdash = CHARINDEX('-', name + '-')
FROM Table1
UNION ALL
-- recursive step: the next part starts right after the previous dash
SELECT
id, name,
rownum + 1,
nextdash + 1,
CHARINDEX('-', name + '-', nextdash + 1)
FROM cutpositions c
-- stop once the previous dash position is no longer inside name
WHERE nextdash < LEN(name)
)
SELECT
id,
[1] AS name,
[2] AS name1,
[3] AS name2
/* add more columns here */
FROM (
-- cut each part out of name using the positions computed above
SELECT
id, rownum,
part = SUBSTRING(name, startpos, nextdash - startpos)
FROM cutpositions
) s
-- MAX() is only the aggregate PIVOT requires; the derived table yields one
-- part per id and rownum
PIVOT ( MAX(part) FOR rownum IN ([1], [2], [3] /* extend the list here */) ) x
Without additional modifications this query can split names consisting of up to 100 parts (that's the default maximum recursion depth, which can be changed), but can only display no more than 3 of them. You can easily extend it to however many parts you want it to display, just follow the instructions in the comments.
-- Split Name at its first two dashes.
select
    src.id,
    left(src.Name, first_dash.Pos - 1) as Name,
    substring(src.Name, first_dash.Pos + 1, second_dash.Pos - first_dash.Pos - 1) as Name1,
    substring(src.Name, second_dash.Pos + 1, len(src.Name)) as Name2
from Table1 as src
-- position of the first dash
cross apply (select charindex('-', src.Name)) as first_dash(Pos)
-- position of the second dash, searched from just behind the first one
cross apply (select charindex('-', src.Name, first_dash.Pos + 1)) as second_dash(Pos)
Testing performance of suggested solutions
Setup:
-- Fixture for the performance comparison.
create table Table1
(
    id   int identity primary key,
    Name varchar(50)
)
go
-- the three sample names; "go 10000" below runs this batch 10000 times
insert into Table1 (Name)
select '1-aaa-14 milan road'
union all
select '23-abcde-lsd road'
union all
select '2-mnbvcx-welcoome street'
go 10000
Result:
if you always will have 2 dashes, you can do the following by using PARSENAME
--testing table
-- Test table holding the three sample rows.
CREATE TABLE #test (id INT, NAME VARCHAR(1000));

INSERT INTO #test (id, NAME) VALUES (1, '1-aaa-14 milan road');
INSERT INTO #test (id, NAME) VALUES (2, '23-abcde-lsd road');
INSERT INTO #test (id, NAME) VALUES (3, '2-mnbvcx-welcoome street');

-- PARSENAME splits on dots, so the dashes are turned into dots first;
-- part 3 is the leftmost and part 1 the rightmost of the three pieces.
SELECT
    dotted.id,
    PARSENAME(dotted.NAME, 3) AS name,
    PARSENAME(dotted.NAME, 2) AS name1,
    PARSENAME(dotted.NAME, 1) AS name2
FROM (
    SELECT id, REPLACE(NAME, '-', '.') AS NAME
    FROM #test
) AS dotted;
if you have dots in the name column you have to first replace them and then replace them back to dots in the end
example, by using a tilde to substitute the dot
-- Extra sample row whose name contains a dot.
INSERT INTO #test (id, NAME) VALUES (3, '5-mnbvcx-welcoome street.');

-- Inner query: hide the real dots behind '~', then turn the dashes into dots.
-- Outer query: split with PARSENAME and restore the hidden dots in each part.
SELECT
    masked.id,
    REPLACE(PARSENAME(masked.NAME, 3), '~', '.') AS name,
    REPLACE(PARSENAME(masked.NAME, 2), '~', '.') AS name1,
    REPLACE(PARSENAME(masked.NAME, 1), '~', '.') AS name2
FROM (
    SELECT id, REPLACE(REPLACE(NAME, '.', '~'), '-', '.') AS NAME
    FROM #test
) AS masked;