Postgres group by similarity result

Postgres group by similarity result - postgresql

I have several tables, each with a Name and a Result column.
I want to join the tables by name similarity, but limit the rows returned for each group of similar names to at most 2.
-- Fixture for the similarity question: two tables with the same columns
-- (id, name, result) and their sample rows.
create table data_a (
    id     serial      primary key,
    name   varchar(70) not null,
    result integer     not null
);

create table data_b (
    id     serial      primary key,
    name   varchar(70) not null,
    result integer     not null
);

insert into data_a (name, result)
values
    ('Todd', 2),
    ('John', 5);

-- Row order is unchanged so the generated serial ids stay the same.
insert into data_b (name, result)
values
    ('Johns',  5),
    ('Todi',   3),
    ('Tod',    4),
    ('Todd',   5),
    ('John',   1),
    ('Jon',    1),
    ('Johny',  1),
    ('Johnny', 1),
    ('Johni',  1);
I would like to run a query that joins both tables by name similarity and limits the results to at most 2 rows per name:
-- Asker's attempt, kept exactly as posted.
-- NOTE(review): similarity() and % look like the pg_trgm function/operator — confirm the extension is installed.
-- The third output column calls similarity() with da.Name for BOTH arguments and
-- compares the result to 0.5, so it is a boolean rather than a similarity score.
SELECT da.Name as Name_a,db.Name , similarity(da.Name,da.Name) > 0.5
FROM data_a da
JOIN data_b db
ON da.Name % db.Name
GROUP BY da.Name,db.Name
-- ORDER BY carries no direction keyword (ascending is the default), and the single
-- LIMIT below caps the whole result set, not the rows for each da.Name.
ORDER BY similarity
LIMIT 2
and receive
|Name_a|Name_b|similarity|
|------|------|----------|
|Todd | Todd | 1 |
|------|------|----------|
|Todd | Tod |0.8 |
|------|------|----------|
|John | John |1 |
|------|------|----------|
|John | Johny|0.76 |
|------|------|----------|
currently i get
|Name_a|Name_b|similarity|
|------|------|----------|
|Todd | Todd | 1 |
|John | John |1 |
It seems that I'm not using GROUP BY correctly. How can I group the results so that the limit applies to each name separately?

If you want to apply the LIMIT separately per da.Name, you would have to do it in a LATERAL subquery:
-- For every row of data_a, return the (at most) two most similar names from data_b.
-- The LIMIT sits inside the LATERAL subquery, so it is applied once per data_a row.
-- NOTE(review): the % operator and similarity() are assumed to be the pg_trgm ones.
SELECT
    da.Name AS Name_a,
    db.Name AS Name_b,          -- alias matches the column header the question asks for
    db.sim  AS similarity
FROM data_a da
CROSS JOIN LATERAL (
    SELECT
        b.Name,
        similarity(da.Name, b.Name) AS sim
    FROM data_b b
    WHERE da.Name % b.Name
    -- b.Name breaks ties so the chosen pair is deterministic
    ORDER BY similarity(da.Name, b.Name) DESC, b.Name
    LIMIT 2
) db
-- The subquery's ORDER BY only decides which rows survive the LIMIT;
-- the final listing needs its own ordering.
ORDER BY da.Id, db.sim DESC, db.Name;

You can use lateral join to map each name to a name from the first table, then truncate the result to 2 values for each name.
-- Pair each data_a name with data_b names where one contains the other,
-- then keep the two best-scoring pairs for each data_a name.
-- Output columns: r, da_name, db_name, similarity.
with data as (
    select
        da.name as da_name,
        db.name as db_name,
        similarity(da.name::text, db.name::text) as similarity
    from data_a da
    -- The original LATERAL subquery never referenced da, so a plain join is equivalent.
    -- LEFT JOIN keeps data_a names that have no partner (db_name and similarity are NULL).
    left join data_b db
        on da.name like '%' || db.name || '%'
        or db.name like '%' || da.name || '%'
)
select
    x.r,
    x.da_name,
    x.db_name,
    x.similarity
from (
    select
        row_number() over (
            partition by t.da_name
            -- highest score first; ascending order would keep the two WORST matches
            order by t.similarity desc nulls last, t.db_name
        ) as r,
        t.*
    from data t
) x
where x.r <= 2;
Demo in sql<>daddy.io

Related

PostgreSQL select only last row from each recursion

Let's have an employee hierarchy given with the following table:
-- Employee hierarchy fixture: manager_id points at another employee row,
-- and is NULL for employees without a manager.
create table employee (
    id         serial  primary key,
    first_name varchar not null,
    manager_id integer,
    foreign key (manager_id) references employee (id)
);

-- Row order is unchanged so the generated ids (1..20) stay the same.
insert into employee (first_name, manager_id)
values
    ('Arden',     null),
    ('Oliver',    null),
    ('Luisa',     null),
    ('Amelia',    null),
    ('Olivia',    null),
    ('Lily',      2),
    ('Ava',       2),
    ('Isabella',  2),
    ('Charlie',   2),
    ('Beatrice',  3),
    ('Stephanie', 3),
    ('Emily',     3),
    ('Mila',      3),
    ('Isla',      4),
    ('Ashley',    4),
    ('James',     7),
    ('Jack',      7),
    ('William',   8),
    ('Harry',     8),
    ('Robin',     8);
For an employee, for instance with id = 20, we can find the highest level manager using query:
-- Climb the manager chain starting at employee 20; cnt counts the steps taken.
-- The final SELECT keeps the row(s) with the largest cnt, i.e. the top of the chain.
WITH RECURSIVE chain (cnt, id, first_name, manager_id) AS (
    SELECT 0, e.id, e.first_name, e.manager_id
    FROM employee AS e
    WHERE e.id = 20

    UNION ALL

    SELECT c.cnt + 1, boss.id, boss.first_name, boss.manager_id
    FROM chain AS c
    INNER JOIN employee AS boss
        ON boss.id = c.manager_id
)
SELECT r.cnt, r.id, r.first_name, r.manager_id
FROM chain AS r
WHERE r.cnt = (SELECT max(k.cnt) FROM chain AS k);
But I need to obtain the entire list of employees --- highest_level_manager like the following:
employee | highest_level_manager
----------------------------------
Robin | Oliver
Harry | Oliver
William | Oliver
Jack | Oliver
James | Oliver
Ashley | Amelia
Isla | Amelia
Mila | Luisa
Emili | Luisa
Stephanie | Luisa
Beatrice | Luisa
Charlie | Oliver
Isabella | Oliver
Ava | Oliver
Lily | Oliver
Olivia | null
Amelia | null
Luisa | null
Oliver | null
Arden | null
Does anyone know how to do this?

Walk the tree for all employees. Reduce recursion to ids, add names in the final query:
-- List every employee with their highest-level manager (NULL if they have none).
-- The recursion carries only ids; names are attached in the final query.
with recursive cte as (
    -- level 0: each employee paired with their direct manager
    select
        id, manager_id, 0 as level
    from
        employee
    union all
    -- step up only while the current manager has a manager of their own,
    -- so the deepest row per employee holds the top-level manager
    select
        c.id, e.manager_id, c.level + 1
    from cte c
    join employee e on c.manager_id = e.id and e.manager_id is not null
)
select
    e.first_name as employee,
    m.first_name as highest_level_manager
from (
    -- one row per employee: the one with the greatest level
    select distinct on (id) id, manager_id
    from cte
    order by id, level desc
) c
join employee e on c.id = e.id
left join employee m on c.manager_id = m.id
-- A subquery's ordering is not guaranteed to survive the joins; state the final
-- order explicitly (newest id first, as in the requested listing).
order by e.id desc;
See live demo in Db<>fiddle.

Optimising T-SQL reporting performance

I have the table below. I need to delete rows in opposite pairs between two dates, based on the PerCode and Value columns:
that is, we delete rows inside the date range that have the same PerCode and equal and opposite values.
The problem is that the begin date and end date are supplied by users as report parameters, and the query takes too much time if I try to delete these rows at runtime.
Example:
Begin date = 01/01/2018
End date = 31/12/2018
I should delete rows 3 and 4.
Do you have any idea how to do that with good performance? (The table has 200 million rows.)
+----+------------+---------+---------+-----------+
| Id | Date | PerCode | Value | IsDeleted |
+----+------------+---------+---------+-----------+
| 1 | 01/10/2017 | C1 | 10 | |
| 2 | 01/01/2018 | C1 | -10 | |
| 3 | 15/02/2018 | C2 | 20 | 1 |
| 4 | 10/03/2018 | C2 | -20 | 1 |
| 5 | 01/12/2018 | C3 | 15 | |
| 6 | 01/02/2019 | C3 | -15 | |
+----+------------+---------+---------------------+

I had a quick go at this, using a table variable to allow me to knock together a query using your test data. However, this might not perform well when used over 2 million rows?
-- Flag (isdeleted = 1) rows inside a date window whose value is cancelled by an
-- equal and opposite value for the same percode. Uses a table variable as test data.
-- Table variables and scalar variables take the @ prefix; # denotes a temp table
-- and is not valid after DECLARE.
DECLARE @table TABLE (id INT, [date] DATE, percode CHAR(2), [value] INT, isdeleted BIT);

INSERT INTO @table (id, [date], percode, [value], isdeleted)
VALUES
    (1, '20171001', 'C1',  10, NULL),
    (2, '20180101', 'C1', -10, NULL),
    (3, '20180215', 'C2',  20, NULL),
    (4, '20180310', 'C2', -20, NULL),
    (5, '20181201', 'C3',  15, NULL),
    (6, '20190201', 'C3', -15, NULL);

DECLARE @date_from DATE = '20180101';
DECLARE @date_to   DATE = '20181231';

WITH ordered AS (
    -- Rows in the window that are not already flagged, numbered within each
    -- (percode, value) group. Ordering by id makes the numbering repeatable;
    -- ordering by [value] inside a partition on [value] is arbitrary.
    SELECT
        id,
        percode,
        [value],
        ROW_NUMBER() OVER (PARTITION BY percode, [value] ORDER BY id) AS order_id
    FROM
        @table
    WHERE
        [date] BETWEEN @date_from AND @date_to
        AND (isdeleted IS NULL OR isdeleted = 0)
),
matches AS (
    -- n-th positive row pairs with the n-th negative row of the same magnitude.
    -- m1 is restricted to positive values so a zero row cannot pair with itself
    -- and each pair is produced once.
    SELECT
        m1.id AS match_1_id,
        m2.id AS match_2_id
    FROM
        ordered m1
        INNER JOIN ordered m2
            ON m1.percode = m2.percode
           AND m1.[value] = -m2.[value]
           AND m1.order_id = m2.order_id
    WHERE
        m1.[value] > 0
)
UPDATE
    t
SET
    isdeleted = 1
FROM
    @table t
    INNER JOIN matches m ON m.match_1_id = t.id OR m.match_2_id = t.id;

SELECT id, [date], percode, [value], isdeleted FROM @table;
Results:
id date percode value isdeleted
1 2017-10-01 C1 10 NULL
2 2018-01-01 C1 -10 NULL
3 2018-02-15 C2 20 1
4 2018-03-10 C2 -20 1
5 2018-12-01 C3 15 NULL
6 2019-02-01 C3 -15 NULL
How does it work? Well I broke the task down into steps:
make a list of all rows in the date period specified, where they aren't already deleted;
for each row of data assign it a running count number, grouped by the percode and the value. So the first C1 10 would be number #1, then the second C1 10 would be number #2, etc.;
to find matches it's simply a case of finding any value that has the same percode, the equal and opposite value to another value group, and the same running count number;
where there's a match set the isdeleted flag to 1.

Here is my code, but it does not perform well over 200 million rows in real time.
In real life, PerCode is a concatenation of 5 columns (date, varchar(13), varchar(2), varchar(1) and varchar(50)) and Value is 4 numeric columns.
I am looking for other ideas.
-- Flag (IsDeleted = 1) rows inside a date window that cancel out within their
-- PerCode. Test data lives in the temp table #MasterTable; ids to flag are
-- staged in #TmpTable. Scalar variables take the @ prefix (# is only for temp tables).
SELECT *
INTO #MasterTable
FROM
(
    SELECT 1 AS id, '20171001' AS [date], 'C1' AS percode, 10 AS [value], NULL AS isdeleted
    UNION ALL
    SELECT 2, '20180101', 'C1', -10, NULL
    UNION ALL
    SELECT 3, '20180215', 'C2', 20, NULL
    UNION ALL
    SELECT 4, '20180310', 'C2', -20, NULL
    UNION ALL
    SELECT 5, '20181201', 'C3', 15, NULL
    UNION ALL
    SELECT 6, '20190201', 'C3', -15, NULL
) T;

DECLARE @date_from DATE = '20180101';
DECLARE @date_to   DATE = '20181231';

WITH in_range AS (
    -- Rows inside the window, numbered within each (PerCode, Value) group.
    -- The same numbered set feeds both sides of the join below; numbering the
    -- whole table on one side and the filtered rows on the other lets rows
    -- outside the window pick up a matching number and get flagged.
    -- ORDER BY Id makes the numbering repeatable.
    SELECT
        Id,
        PerCode,
        Value,
        ROW_NUMBER() OVER (PARTITION BY PerCode, Value ORDER BY Id) AS Rn
    FROM #MasterTable
    WHERE [date] BETWEEN @date_from AND @date_to
),
cancelled AS (
    -- (PerCode, Rn) slots whose values sum to zero across more than one row
    SELECT
        PerCode,
        Rn
    FROM in_range
    GROUP BY PerCode, Rn
    HAVING SUM(Value) = 0 AND COUNT(*) > 1
)
SELECT F.Id AS id
INTO #TmpTable
FROM in_range F
INNER JOIN cancelled B
    ON F.PerCode = B.PerCode
   AND F.Rn = B.Rn;

UPDATE R
SET IsDeleted = 1
FROM #MasterTable R
INNER JOIN #TmpTable P
    ON R.id = P.id;

SELECT * FROM #MasterTable;

DROP TABLE #MasterTable;
DROP TABLE #TmpTable;

How to split data of one column into multiple columns on the basis of a condition

I have one table having data
Category. New data
Cost of equipment. 23
Price of equipments. 45
Cost of M&C. 13
Price of M&C. 12
And one another table having
Category
Equipments
M&C
Now i want data as below
Category Cost Price
Equipment 23 45
M&C 13 12
Can you please help me in solving this

You may try this. A better approach is to change your table design.
Note that while joining I had to use RTRIM to remove s from equipments. I am not aware of any other variations in your data which might not match between the two tables. Please change the join conditions appropriately ( or use a REGEXP match instead of ILIKE if they don't )
SQL Fiddle
PostgreSQL 9.6 Schema Setup:
-- Fixture for the cost/price question: Table1 holds labelled figures,
-- Table2 holds the category names to report on.
create table Table1 (
    Category varchar(19),
    New_data int
);

insert into Table1 (Category, New_data)
values
    ('Cost of equipment',   23),
    ('Price of equipments', 45),
    ('Cost of M&C',         13),
    ('Price of M&C',        12);

create table Table2 (
    Category varchar(10)
);

insert into Table2 (Category)
values
    ('Equipments'),
    ('M&C');
Query 1:
-- Turn the "Cost of X" / "Price of X" rows of Table1 into one row per Table2
-- category with a cost column and a price column.
-- RTRIM(..., 's') strips trailing 's' characters from the category before matching.
WITH cost_rows AS (
    SELECT
        cat.category,
        src.new_data
    FROM TABLE1 AS src
    INNER JOIN TABLE2 AS cat
        ON src.Category ILIKE '%cost%' || RTRIM(cat.Category, 's') || '%'
),
price_rows AS (
    SELECT
        cat.category,
        src.new_data
    FROM TABLE1 AS src
    INNER JOIN TABLE2 AS cat
        ON src.Category ILIKE '%price%' || RTRIM(cat.Category, 's') || '%'
)
SELECT
    cost_rows.category,
    cost_rows.new_data  AS cost,
    price_rows.new_data AS price
FROM cost_rows
INNER JOIN price_rows
    ON cost_rows.category = price_rows.category
Results:
| category | cost | price |
|------------|------|-------|
| Equipments | 23 | 45 |
| M&C | 13 | 12 |

PostgreSQL UNION don't merge lines properly

I have 3 tables in a PostgreSQL database:
localities (loc, 12561 rows)
plants (pl, 17052 rows)
specimens or samples (esp, 9211 rows)
pl and esp each have a field loc, to specify where that tagged plant lives, or where that sample (usually a branch with leaves and flowers) came from.
I need a report of the places that have plants or samples, and the number of plants and samples in each place. The best I did up to now is the union of two subqueries, that runs very fast (33 ms to fetch 69 rows):
-- Asker's report as two UNIONed halves: the first counts plants per locality,
-- the second counts samples. Each half fills only one of the two count columns,
-- so a locality present in both halves comes back as two rows.
(
    select
        l.id,
        l.nome,
        count(p.id) as pls,
        null        as esps
    from loc as l
    left join pl as p
        on p.loc = l.id
    where exists (
        select 1
        from pl as used
        where used.loc = l.id
          and used.loc > 0
    )
    group by l.id, l.nome

    union

    select
        l.id,
        l.nome,
        null        as pls,
        count(s.id) as esps
    from loc as l
    left join esp as s
        on s.loc = l.id
    where exists (
        select 1
        from esp as used
        where used.loc = l.id
          and used.loc > 0
    )
    group by l.id, l.nome
)
order by id
The point is, when the same place has both plants and samples, it becomes two distinct lines, like:
11950 | San Martin | | 5 |
11950 | San Martin | 61 | |
Of course what I want is:
11950 | San Martin | 61 | 5 |
Before that, I have tried doing all in one query:
-- Asker's single-query attempt. Both LEFT JOINs hang off loc, so every plant row
-- of a locality is combined with every sample row of that locality before the
-- counts are taken.
select
    l.id,
    l.nome,
    count(p.id),
    count(s.id) as esps
from loc as l
left join pl as p
    on p.loc = l.id
left join esp as s
    on s.loc = l.id
where exists (
        select 1
        from pl as used
        where used.loc = l.id
          and used.loc > 0
    )
   or exists (
        select 1
        from esp as used
        where used.loc = l.id
          and used.loc > 0
    )
group by l.id, l.nome
but it returns a strange repetition (it's multiplying both results and showing the result twice):
11950 | San Martin | 305 | 305 |
I have tried without subqueries, but it was taking about 13 seconds, which is too long.

I created test layout with:
-- Synthetic test layout sized like the question: 12561 localities,
-- 17052 plants and 9211 samples, each assigned to a random locality.
create table localities (id integer, loc_name text);
create table plants (plant_id integer, loc_id integer);
create table samples (sample_id integer, loc_id integer);

insert into localities (id, loc_name)
select x, 'Loc ' || x::text
from generate_series(1, 12561) as x;

-- random() is in [0, 1), so 1 + floor(random() * 12561) is uniform over 1..12561.
-- Rounding random() * 12561 instead can yield 0, which is not a locality id.
insert into plants (plant_id, loc_id)
select x, 1 + floor(random() * 12561)::integer
from generate_series(1, 17052) as x;

insert into samples (sample_id, loc_id)
select x, 1 + floor(random() * 12561)::integer
from generate_series(1, 9211) as x;
The trick is to create an intermediate table from plants and samples but with same structure. Where data doesn't make sense (plant has no sample_id), you add null:
-- Stack plants and samples into one shape (loc_id, plant_id, sample_id);
-- the id that does not apply to a row is NULL.
select
    p.loc_id,
    p.plant_id,
    null as sample_id
from plants as p
union all
select
    s.loc_id,
    null as plant_id,
    s.sample_id
from samples as s
This table has unified structure and you can then aggregate on it (I'm using WITH to make it a bit more readable.):
-- Count plants and samples per locality from the stacked rows.
-- count(column) skips NULLs, so each counter only sees its own kind of row.
with stacked as (
    select p.loc_id, p.plant_id, null as sample_id
    from plants as p
    union all
    select s.loc_id, null as plant_id, s.sample_id
    from samples as s
)
select
    u.loc_id,
    count(u.plant_id)  as plant_count,
    count(u.sample_id) as sample_count
from stacked as u
group by u.loc_id;
If you need additional data from localities, you can join them on the aggregated table:
-- Same per-locality counts, then attach the locality columns.
-- The outer join keeps a counted loc_id even if localities has no row for it.
with stacked as (
    select p.loc_id, p.plant_id, null as sample_id
    from plants as p
    union all
    select s.loc_id, null as plant_id, s.sample_id
    from samples as s
),
aggregated as (
    select
        u.loc_id,
        count(u.plant_id)  as plant_count,
        count(u.sample_id) as sample_count
    from stacked as u
    group by u.loc_id
)
select
    aggregated.*,
    localities.*
from aggregated
left join localities
    on localities.id = aggregated.loc_id;
This takes 75ms on my laptop all together.

This should be as easy as
-- One row per location with its plant and sample counts (one correlated
-- subquery each); locations with neither are filtered out.
with location_counts as (
    select
        loc.*,
        (select count(p.id) from plant as p where p.location = loc.id) as plants,
        (select count(s.id) from sample as s where s.location = loc.id) as samples
    from location as loc
)
select *
from location_counts as lc
where lc.plants > 0
   or lc.samples > 0;
id | name | plants | samples
----+------------+--------+---------
1 | San Martin | 2 | 1
2 | Rome | 1 | 2
3 | Dallas | 3 | 1
(3 rows)
This is the database I quickly set up to experiment with:
-- Small hand-made fixture: four locations, six plants, four samples.
-- 'Ghost Town' deliberately has neither plants nor samples.
create table location (
    id   serial primary key,
    name text
);

create table plant (
    id       serial primary key,
    name     text,
    location integer references location (id)
);

create table sample (
    id       serial primary key,
    name     text,
    location integer references location (id)
);

insert into location (name)
values
    ('San Martin'),
    ('Rome'),
    ('Dallas'),
    ('Ghost Town');

insert into plant (name, location)
values
    ('San Martin Dandelion', 1),
    ('San Martin Camomile',  1),
    ('Rome Raspberry',       2),
    ('Dallas Locoweed',      3),
    ('Dallas Lemongrass',    3),
    ('Dallas Setaria',       3);

insert into sample (name, location)
values
    ('San Martin Bramble', 1),
    ('Rome Iris',          2),
    ('Rome Eucalypt',      2),
    ('Dallas Dogbane',     3);
tests=# select * from location;
id | name
----+------------
1 | San Martin
2 | Rome
3 | Dallas
4 | Ghost Town
(4 rows)
tests=# select * from plant;
id | name | location
----+----------------------+----------
1 | San Martin Dandelion | 1
2 | San Martin Camomile | 1
3 | Rome Raspberry | 2
4 | Dallas Locoweed | 3
5 | Dallas Lemongrass | 3
6 | Dallas Setaria | 3
(6 rows)
tests=# select * from sample;
id | name | location
----+--------------------+----------
1 | San Martin Bramble | 1
2 | Rome Iris | 2
3 | Rome Eucalypt | 2
4 | Dallas Dogbane | 3
(4 rows)

I didn't test that but I think it could be something like this:
-- Plants and samples per locality in a single pass.
-- Both LEFT JOINs hang off loc, so each plant row is combined with each sample
-- row of the same locality; counting DISTINCT ids undoes that multiplication.
-- NOTE(review): assumes pl.id and esp.id are unique per row — confirm.
SELECT
    l.id,
    l.nome,
    COUNT(DISTINCT pl.id) AS plants_count,
    COUNT(DISTINCT e.id)  AS esp_count
FROM loc l
LEFT JOIN pl ON pl.loc = l.id
LEFT JOIN esp e ON e.loc = l.id
GROUP BY l.id, l.nome
-- keep only places that actually have plants or samples
HAVING COUNT(pl.id) > 0 OR COUNT(e.id) > 0
ORDER BY l.id
The point is to count non null ids of each type.

joining tables with different length column values

I need to get ID by joining columns of tables with variable length.
Table A has 2 columns ID and PostCode
-----------------
| ID | PostCode |
|----|----------|
| 1 | BR |
|----|----------|
| 2 | WT |
|----|----------|
| 3 | B71 |
|----|----------|
| 4 | BR5 |
|----|----------|
Table B has columns with Name and Full postcode
|------|----------|
| Name | PostCode |
|------|----------|
| Mr X | CR2 5ER |
|------|----------|
| Ms Y | BT2 6ER |
|------|----------|
| XX | B71 4WQ |
|------|----------|
| YY | BR4 8ER |
|------|----------|
| SS | BR5A 5RT |
|------|----------|
I need to get Id's 1 [BR->BR4 8ER], 3 [B71->B71 4WQ] and 4 [BR5->BR5A 5RT]
How do I get to work this?

-- Match each full postcode in B to the A rows whose PostCode is its prefix.
-- LEFT(s, n) returns exactly the first n characters. SUBSTRING(s, 0, n) starts
-- one position before the string and therefore yields only n - 1 characters,
-- so it can never equal A.PostCode.
select A.PostCode, B.PostCode as FullPostCode, B.Name
from A
join B
    on left(B.PostCode, len(A.PostCode)) = A.PostCode

Consider the postcode BR29 8LN. If table A has codes B and BR, this postcode will be captured TWICE - not what the OP would want, and not what I wanted.
The below captures everything so long as after the postcode prefix, there is a number thus delimiting the postcode area:
-- Match a full postcode to a prefix in A only when the character right after
-- the prefix is a digit, so area 'B' does not also claim 'BR29 8LN'.
select A.PostCode, B.PostCode as FullPostCode, B.Name
from B
inner join A
    on left(B.PostCode, len(A.PostCode)) = A.PostCode
-- LIKE '[0-9]' accepts digits only; ISNUMERIC also returns 1 for characters
-- such as '+', '-', '.' and '$'.
where substring(B.PostCode, len(A.PostCode) + 1, 1) like '[0-9]'

This may help.
-- Build one comma-separated string of "UserID [prefix-full postcode]" matches.
-- Table variables take the @ prefix (# is not valid after DECLARE).
DECLARE @TableA TABLE (UserID INT,
                       PostCode VARCHAR(10));
DECLARE @TableB TABLE (Name VARCHAR(10),
                       PostCode VARCHAR(10));

INSERT INTO @TableA (UserID, PostCode)
VALUES
    (1, 'BR'),
    (2, 'WT'),
    (3, 'B71'),
    (4, 'BR5');

INSERT INTO @TableB (Name, PostCode)
VALUES
    ('Mr X', 'CR2 5ER'),
    ('Ms Y', 'BT2 6ER'),
    ('XX', 'B71 4WQ'),
    ('YY', 'BR4 8ER'),
    ('SS', 'BR5A 5RT');

WITH prefix_matches
AS (
    -- Every (prefix, full postcode) pair where the prefix starts the postcode.
    -- PrefixRank = 1 marks the longest prefix for each full postcode, so
    -- 'BR5A 5RT' belongs to 'BR5' and is not also claimed by 'BR'.
    SELECT ta.UserID,
           tb.Name,
           tb.PostCode,
           ta.PostCode AS PostCode2,
           ROW_NUMBER() OVER (PARTITION BY tb.Name, tb.PostCode
                              ORDER BY LEN(ta.PostCode) DESC) AS PrefixRank
    FROM @TableA AS ta
        JOIN @TableB AS tb
            ON ta.PostCode = LEFT(tb.PostCode, LEN(ta.PostCode))
)
, CTE
AS (
    -- One representative postcode per user (PcID = 1).
    SELECT UserID,
           Name,
           PostCode,
           PostCode2,
           ROW_NUMBER() OVER (PARTITION BY UserID ORDER BY PostCode DESC) AS PcID
    FROM prefix_matches
    WHERE PrefixRank = 1
)
-- FOR XML PATH('') concatenates the fragments; STUFF removes the leading ', '.
SELECT STUFF((SELECT ', ' + CAST(c.UserID AS VARCHAR(10)) + ' [' + c.PostCode2 + '-' + c.PostCode + ']'
              FROM CTE AS c
              WHERE c.PcID = 1
              ORDER BY c.UserID
             FOR XML PATH('')), 1, 2, '') AS PostCodeMatch
-- return no row at all when nothing matched
WHERE EXISTS (SELECT 1 FROM CTE WHERE PcID = 1);

You might do something like this:
-- Pair every full postcode in B with each A row whose PostCode it starts with.
select
    prefix.PostCode,
    addr.PostCode as FullPostCode,
    addr.Name
from B as addr
inner join A as prefix
    on addr.PostCode like prefix.PostCode + '%'

We Keep Coding

iphone swift flutter scala powershell matlab mongodb postgresql perl eclipse

Postgres group by similarity result - postgresql

Related

PostgreSQL select only last row from each recursion

Optimising T-SQL reporting performance

How to split data of one column into multiple columns on the basis of a condition

PostgreSQL UNION don't merge lines properly

joining tables with different length column values

Categories

Resources