Extract words before and after a specific word - tsql

I need to extract the words before and after a word matching '%don%' in an ntext column.
table A, column name: Text
Example:
TEXT
where it was done it will retrieve the...
at the end of the trip clare done everything to improve
it is the only one done in these times
I would like the following results:
was done it
clare done everything
one done in
I am using T-SQL; the LEFT and RIGHT functions did not work with the ntext data type of the column containing the text.

As others have said, you can use a string splitting function to split out each word and then return those you require. Using the previously linked DelimitedSplit8K:
CREATE FUNCTION dbo.DelimitedSplit8K
--===== Define I/O parameters
        (@pString VARCHAR(8000), @pDelimiter CHAR(1))
--WARNING!!! DO NOT USE MAX DATA-TYPES HERE! IT WILL KILL PERFORMANCE!
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
--===== "Inline" CTE Driven "Tally Table" produces values from 1 up to 10,000...
--      enough to cover VARCHAR(8000)
WITH E1(N) AS (
    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
),                                   --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
                --     for both a performance gain and prevention of accidental "overruns"
    SELECT TOP (ISNULL(DATALENGTH(@pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
    SELECT 1 UNION ALL
    SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(@pString,t.N,1) = @pDelimiter
),
cteLen(N1,L1) AS (--==== Return start and length (for use in SUBSTRING)
    SELECT s.N1,
           ISNULL(NULLIF(CHARINDEX(@pDelimiter,@pString,s.N1),0)-s.N1,8000)
    FROM cteStart s
)
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
       Item       = SUBSTRING(@pString, l.N1, l.L1)
FROM cteLen l
;
go
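A quick check of what the splitter returns for a single string:
select ItemNumber, Item from dbo.DelimitedSplit8K('was done it', ' ');
-- ItemNumber | Item
-- 1          | was
-- 2          | done
-- 3          | it
Putting the splitter to work on the sample data, with lag() and lead() picking up the neighbouring words: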
declare @t table (t ntext);
insert into @t values('where it was done it will retrieve the...'),('at the end of the trip clare done everything to improve'),('we don''t take donut donations here'),('ending in don');
with t as (select cast(t as nvarchar(max)) as t from @t)
,d as (select t.t
,case when patindex('%don%',s.Item) > 0 then 1 else 0 end as d
,s.ItemNumber as i
,lag(s.Item,1,'') over (partition by t.t order by s.ItemNumber) + ' '
+ s.Item + ' '
+ lead(s.Item,1,'') over (partition by t.t order by s.ItemNumber) as r
from t
cross apply dbo.DelimitedSplit8K(t.t, ' ') as s
)
select t
,r
from d
where d = 1
order by t
,i;
Output:
+---------------------------------------------------------+-----------------------+
| t | r |
+---------------------------------------------------------+-----------------------+
| at the end of the trip clare done everything to improve | clare done everything |
| ending in don | in don |
| we don't take donut donations here | we don't take |
| we don't take donut donations here | take donut donations |
| we don't take donut donations here | donut donations here |
| where it was done it will retrieve the... | was done it |
+---------------------------------------------------------+-----------------------+
And a working example:
http://rextester.com/RND43071


Cascading sum hierarchy using recursive cte

I'm trying to write a recursive CTE with Postgres but I can't wrap my head around it. Performance shouldn't be an issue, as there are only 50 items in TABLE 1.
TABLE 1 (expense):
id | parent_id | name
------------------------------
1 | null | A
2 | null | B
3 | 1 | C
4 | 1 | D
TABLE 2 (expense_amount):
ref_id | amount
-------------------------------
3 | 500
4 | 200
Expected Result:
id, name, amount
-------------------------------
1 | A | 700
2 | B | 0
3 | C | 500
4 | D | 200
Query
WITH RECURSIVE cte AS (
SELECT
expenses.id,
name,
parent_id,
expense_amount.total
FROM expenses
LEFT JOIN expense_amount ON expense_amount.expense_id = expenses.id
WHERE expenses.parent_id IS NULL
UNION ALL
SELECT
expenses.id,
expenses.name,
expenses.parent_id,
expense_amount.total
FROM cte
JOIN expenses ON expenses.parent_id = cte.id
LEFT JOIN expense_amount ON expense_amount.expense_id = expenses.id
)
SELECT
id,
SUM(amount)
FROM cte
GROUP BY 1
ORDER BY 1
Results
id | sum
--------------------
1 | null
2 | null
3 | 500
4 | 200
You can do a conditional sum() for only the root row:
with recursive tree as (
select id, parent_id, name, id as root_id
from expense
where parent_id is null
union all
select c.id, c.parent_id, c.name, p.root_id
from expense c
join tree p on c.parent_id = p.id
)
select e.id,
e.name,
e.root_id,
case
when e.id = e.root_id then sum(ea.amount) over (partition by root_id)
else amount
end as amount
from tree e
left join expense_amount ea on e.id = ea.ref_id
order by id;
I prefer doing the recursive part first, then joining the related tables to the result of the recursive query, but you could also do the join to expense_amount inside the CTE; a sketch of that variant follows the example link below.
Online example: http://rextester.com/TGQUX53703
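For illustration, here is a sketch of that variant with the join moved inside the CTE (untested here, but it should produce the same output as the query above):
with recursive tree as (
  select e.id, e.parent_id, e.name, e.id as root_id, ea.amount
  from expense e
  left join expense_amount ea on ea.ref_id = e.id
  where e.parent_id is null
  union all
  select c.id, c.parent_id, c.name, p.root_id, ea.amount
  from expense c
  join tree p on c.parent_id = p.id
  left join expense_amount ea on ea.ref_id = c.id
)
select id,
       name,
       root_id,
       case
         when id = root_id then sum(amount) over (partition by root_id)
         else amount
       end as amount
from tree
order by id;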
However, the above only aggregates on the top-level parent, not for any intermediate non-leaf rows.
If you want to see intermediate aggregates as well, this gets a bit more complicated (and is probably not very scalable for large results, but you said your tables aren't that big)
with recursive tree as (
select id, parent_id, name, 1 as level, concat('/', id) as path, null::numeric as amount
from expense
where parent_id is null
union all
select c.id, c.parent_id, c.name, p.level + 1, concat(p.path, '/', c.id), ea.amount
from expense c
join tree p on c.parent_id = p.id
left join expense_amount ea on ea.ref_id = c.id
)
select e.id,
lpad(' ', (e.level - 1) * 2, ' ')||e.name as name,
e.amount as element_amount,
(select sum(amount)
from tree t
where t.path like e.path||'%') as sub_tree_amount,
e.path
from tree e
order by path;
Online example: http://rextester.com/MCE96740
The query builds up a path of all IDs belonging to a (sub)tree and then uses a scalar sub-select to get all child rows belonging to a node. That sub-select is what will make this quite slow as soon as the result of the recursive query can't be kept in memory.
I used the level column to create a "visual" display of the tree structure - this helps me debug the statement and understand the result better. If you need the real name of an element in your program, you would obviously use only e.name instead of pre-pending it with blanks.
I could not get your query to work for some reason. Here's my attempt that works for the particular table you provided (parent-child, no grandchild) without recursion. SQL Fiddle
--- step 1: get parent-child data together
with parent_child as(
select t.*, amount
from
(select e.id, f.name as name,
coalesce(f.name, e.name) as pname
from expense e
left join expense f
on e.parent_id = f.id) t
left join expense_amount ea
on ea.ref_id = t.id
)
--- final step is to group by id, name
select id, pname, sum(amount)
from
(-- step 2: group by parent name and find corresponding amount
-- returns A, B
select e.id, t.pname, t.amount
from expense e
join (select pname, sum(amount) as amount
from parent_child
group by 1) t
on t.pname = e.name
-- step 3: to get C, D we union and get corresponding columns
-- results in all rows and corresponding value
union
select id, name, amount
from expense e
left join expense_amount ea
on e.id = ea.ref_id
) t
group by 1, 2
order by 1;

Populate random data from another table

update dataset1.test
set column4 = (select column1
from dataset2
order by random()
limit 1
)
I need to update column4 of dataset1 so that each row gets a random entry from column1 of dataset2. But with the query above I get only one random entry repeated across all the rows of dataset1, whereas I want each row to get its own random value.
SETUP
Let's start by assuming your tables and data are the following.
Note that I assume that dataset1 has a primary key (it can be a composite one, but, for the sake of simplicity, let's make it an integer):
CREATE TABLE dataset1
(
id INTEGER PRIMARY KEY,
column4 TEXT
) ;
CREATE TABLE dataset2
(
column1 TEXT
) ;
We fill both tables with sample data
INSERT INTO dataset1
(id, column4)
SELECT
i, 'column 4 for id ' || i
FROM
generate_series(101, 120) AS s(i);
INSERT INTO dataset2
(column1)
SELECT
'SOMETHING ' || i
FROM
generate_series (1001, 1020) AS s(i) ;
Sanity check:
SELECT count(DISTINCT column4) FROM dataset1 ;
| count |
| ----: |
| 20 |
Case 1: number of rows in dataset1 <= rows in dataset2
We'll perform a complete shuffling. Each value from dataset2 will be used exactly once.
EXPLANATION
In order to make an update that shuffles all the values from column4 in a
random fashion, we need some intermediate steps.
First, for dataset1, we need to create a list (relation) of tuples (id, rn) that are just:
(id_1, 1),
(id_2, 2),
(id_3, 3),
...
(id_20, 20)
Where id_1, ..., id_20 are the ids present in dataset1.
They can be of any type, they need not be consecutive, and they can be composite.
For dataset2, we need to create another list of (column1, rn) tuples that looks like:
(column1_1, 17),
(column1_2, 3),
(column1_3, 11),
...
(column1_20, 15)
In this case, the second column contains all the values 1 .. 20, but shuffled.
Once we have the two relations, we JOIN them ON ... rn. This, in practice, produces yet another list of tuples with (id, column1), where the pairing has been done randomly. We use these pairs to update dataset1.
THE REAL QUERY
This can all be done (clearly, I hope) by using a chain of CTEs (a WITH statement) to hold the intermediate relations:
WITH original_keys AS
(
-- This creates tuples (id, rn),
-- where rn increases from 1 to the number of rows
SELECT
id,
row_number() OVER () AS rn
FROM
dataset1
)
, shuffled_data AS
(
-- This creates tuples (column1, rn)
-- where rn moves between 1 and number of rows, but is randomly shuffled
SELECT
column1,
-- The next statement is what *shuffles* all the data
row_number() OVER (ORDER BY random()) AS rn
FROM
dataset2
)
-- You update your dataset1
-- with the shuffled data, linking back to the original keys
UPDATE
dataset1
SET
column4 = shuffled_data.column1
FROM
shuffled_data
JOIN original_keys ON original_keys.rn = shuffled_data.rn
WHERE
dataset1.id = original_keys.id ;
Note that the trick is performed by means of:
row_number() OVER (ORDER BY random()) AS rn
The row_number() window function produces as many consecutive numbers as there are rows, starting from 1.
These numbers are randomly shuffled because the OVER clause takes all the data and sorts it randomly.
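To see the trick in isolation, you can run the shuffling part on its own (the rn assignment will be different on every execution):
SELECT
    column1,
    row_number() OVER (ORDER BY random()) AS rn
FROM
    dataset2
ORDER BY
    rn ;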
CHECKS
We can check again:
SELECT count(DISTINCT column4) FROM dataset1 ;
| count |
| ----: |
| 20 |
SELECT * FROM dataset1 ;
id | column4
--: | :-------------
101 | SOMETHING 1016
102 | SOMETHING 1009
103 | SOMETHING 1003
...
118 | SOMETHING 1012
119 | SOMETHING 1017
120 | SOMETHING 1011
ALTERNATIVE
Note that this can also be done with subqueries, by simple substitution, instead of CTEs. That might improve performance on some occasions:
UPDATE
dataset1
SET
column4 = shuffled_data.column1
FROM
(SELECT
column1,
row_number() OVER (ORDER BY random()) AS rn
FROM
dataset2
) AS shuffled_data
JOIN
(SELECT
id,
row_number() OVER () AS rn
FROM
dataset1
) AS original_keys ON original_keys.rn = shuffled_data.rn
WHERE
dataset1.id = original_keys.id ;
And again...
SELECT * FROM dataset1;
id | column4
--: | :-------------
101 | SOMETHING 1011
102 | SOMETHING 1018
103 | SOMETHING 1007
...
118 | SOMETHING 1020
119 | SOMETHING 1002
120 | SOMETHING 1016
You can check the whole setup and experiment at dbfiddle here
NOTE: if you do this with very large datasets, don't expect it to be extremely fast. Shuffling a very big deck of cards is expensive.
Case 2: number of rows in dataset1 > rows in dataset2
In this case, values for column4 can be repeated several times.
The easiest possibility I can think of (probably not an efficient one, but easy to understand) is to create a function random_column1, marked as VOLATILE:
CREATE FUNCTION random_column1()
RETURNS TEXT
VOLATILE -- important!
LANGUAGE SQL
AS
$$
SELECT
column1
FROM
dataset2
ORDER BY
random()
LIMIT
1 ;
$$ ;
And use it to update:
UPDATE
dataset1
SET
column4 = random_column1();
This way, some values from dataset2 might not be used at all, whereas others will be used more than once.
dbfiddle here
A better approach is to reference the outer table from the subquery. Then the subquery has to be evaluated for every row:
update dataset1.test
set column4 = (select
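-- the comparison below is always true; it exists only to correlate
-- the subquery with the outer row, forcing one evaluation per row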
case when dataset1.test.column4 = dataset1.test.column4
then column1 end
from dataset2
order by random()
limit 1
)

Percentile calculation with a window function

I know you can get the average, total, min, and max over a subset of the data using a window function. But is it possible to get, say, the median, or the 25th percentile instead of the average with the window function?
Put another way, how do I rewrite this to get the id and the 25th or 50th percentile sales numbers within each district rather than the average?
SELECT id, avg(sales)
OVER (PARTITION BY district) AS district_average
FROM t
You can write this as an aggregation function using percentile_cont() or percentile_disc():
select district, percentile_cont(0.25) within group (order by sales)
from t
group by district;
Unfortunately, Postgres doesn't currently support these as window functions:
select id, percentile_cont(0.25) within group (order by sales) over (partition by district)
from t;
So, you can use a join:
select t.*, p_25, p_75
from t join
(select district,
percentile_cont(0.25) within group (order by sales) as p_25,
percentile_cont(0.75) within group (order by sales) as p_75
from t
group by district
) td
on t.district = td.district
Another way to do this, without the join in Gordon's solution, is to exploit the array_agg function, which can be used as a window function:
create function pg_temp.percentile_cont_window
(c double precision[], p double precision)
returns double precision
language sql
as
$$
with t1 as (select unnest(c) as x)
select percentile_cont(p) WITHIN GROUP (ORDER BY x) from t1;
$$
;
-- -- -- -- -- -- -- -- --
-- Usage examples:
create temporary table t1 as (
select 1 as g, 1 as x
union select 1 as g, 2 as x
union select 2 as g, 3 as x
);
-- Built-in function raises an error if used without group:
-- Error: OVER is not supported for ordered-set aggregate percentile_cont
select *, percentile_cont(.1) within group (order by x) over() from t1;
-- Built-in function with grouping
select g, percentile_cont(.1) within group (order by x) from t1 group by g;
-- | g | percentile_cont |
-- |----:|------------------:|
-- | 1 | 1.1 |
-- | 2 | 3 |
-- Custom function basic usage (note that this is without grouping)
select t1.*, pg_temp.percentile_cont_window(array_agg(x) over(), .1) from t1;
-- | g | x | percentile_cont_window |
-- |----:|----:|-------------------------:|
-- | 1 | 2 | 1.2 |
-- | 1 | 1 | 1.2 |
-- | 2 | 3 | 1.2 |
-- Custom function usage with grouping is the same as using the built-in percentile_cont function
select t1.g, pg_temp.percentile_cont_window(array_agg(x), .1) from t1 group by g;
-- | g | percentile_cont_window |
-- |----:|-------------------------:|
-- | 2 | 3 |
-- | 1 | 1.1 |

SQL to remove rows with duplicated value while keeping one

Say I have this table
id | data | value
-----------------
1 | a | A
2 | a | A
3 | a | A
4 | a | B
5 | b | C
6 | c | A
7 | c | C
8 | c | C
I want to remove the rows with a duplicated value within each data group, keeping the one with the minimum id; e.g. the result will be:
id | data | value
-----------------
1 | a | A
4 | a | B
5 | b | C
6 | c | A
7 | c | C
I know a way to do it is to do a union like:
SELECT 1 [id], 'a' [data], 'A' [value] INTO #test UNION SELECT 2, 'a', 'A'
UNION SELECT 3, 'a', 'A' UNION SELECT 4, 'a', 'B'
UNION SELECT 5, 'b', 'C' UNION SELECT 6, 'c', 'A'
UNION SELECT 7, 'c', 'C' UNION SELECT 8, 'c', 'C'
SELECT * FROM #test WHERE id NOT IN (
SELECT MIN(id) FROM #test
GROUP BY [data], [value]
HAVING COUNT(1) > 1
UNION
SELECT MIN(id) FROM #test
GROUP BY [data], [value]
HAVING COUNT(1) <= 1
)
but this solution has to repeat the same GROUP BY twice (and consider that the real case is a massive GROUP BY with > 20 columns).
I would prefer a simpler answer with less code, as opposed to a complex one. Is there any more concise way to code this?
Thank you
You can use one of the methods below:
Using WITH CTE:
WITH CTE AS
(SELECT *,RN=ROW_NUMBER() OVER(PARTITION BY data,value ORDER BY id)
FROM TableName)
DELETE FROM CTE WHERE RN>1
Explanation:
This query selects the contents of the table along with a row number RN, and then deletes the records with RN > 1 (which are the duplicates).
This Fiddle shows the records which are going to be deleted using this method.
Using NOT IN:
DELETE FROM TableName
WHERE id NOT IN
(SELECT MIN(id) as id
FROM TableName
GROUP BY data,value)
Explanation:
With the given example, the inner query will return the ids (1, 6, 4, 5, 7). The outer query will then delete the records whose id is NOT IN (1, 6, 4, 5, 7).
This fiddle shows the records which are going to be deleted using this method.
Suggestion: Use the first method, since it is faster. It also manages to keep only one record if the id field is duplicated for the same data and value.
I want to add a MySQL solution for this query.
Note 1: MySQL prior to version 8.0 doesn't support the WITH clause.
Note 2: the NOT IN approach throws this error: "You can't specify target table 'TableName' for update in FROM clause".
So the solution is to wrap the table in a derived subquery:
DELETE FROM TableName WHERE id NOT IN
(SELECT MIN(id) as id
FROM (select * from TableName) as t1
GROUP BY data,value);

PostgreSQL: select nearest rows according to sort order

I have a table like this:
a | user_id
----------+-------------
0.1133 | 2312882332
4.3293 | 7876123213
3.1133 | 2312332332
1.3293 | 7876543213
0.0033 | 2312222332
5.3293 | 5344343213
3.2133 | 4122331112
2.3293 | 9999942333
And I want to locate a particular row - 1.3293 | 7876543213 for example - and select the nearest 4 rows. 2 above, 2 below if possible.
Sort order is ORDER BY a ASC.
In this case I will get:
0.0033 | 2312222332
0.1133 | 2312882332
2.3293 | 9999942333
3.1133 | 2312332332
How can I achieve this using PostgreSQL? (BTW, I'm using PHP.)
P.S.: For the last or first row the nearest rows would be 4 above or 4 below.
Test case:
CREATE TEMP TABLE tbl(a float, user_id bigint);
INSERT INTO tbl VALUES
(0.1133, 2312882332)
,(4.3293, 7876123213)
,(3.1133, 2312332332)
,(1.3293, 7876543213)
,(0.0033, 2312222332)
,(5.3293, 5344343213)
,(3.2133, 4122331112)
,(2.3293, 9999942333);
Query:
WITH x AS (
SELECT a
,user_id
,row_number() OVER (ORDER BY a, user_id) AS rn
FROM tbl
), y AS (
SELECT rn, LEAST(rn - 3, (SELECT max(rn) - 5 FROM x)) AS min_rn
FROM x
WHERE (a, user_id) = (1.3293, 7876543213)
)
SELECT *
FROM x, y
WHERE x.rn > y.min_rn
AND x.rn <> y.rn
ORDER BY x.a, x.user_id
LIMIT 4;
Returns the result as depicted in the question. Assumes that (a, user_id) is unique.
It is not clear whether a is supposed to be unique. That's why I sort by user_id additionally, to break ties. That's also why I use the window function row_number(), and not rank(), for this. row_number() is the correct tool in any case. We want 4 rows. rank() would give an undefined number of rows if there were peers in the sort order.
This always returns 4 rows as long as there are at least 5 rows in the table. Close to the first / last row, the first / last 4 rows are returned; the two rows before / after in all other cases. The criteria row itself is excluded.
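To see the two functions side by side on the test case (this data happens to have no duplicates in a, but if it did, rank() would assign equal numbers and then skip ahead, while row_number() stays gapless):
SELECT a, user_id
      ,rank()       OVER (ORDER BY a)          AS rnk
      ,row_number() OVER (ORDER BY a, user_id) AS rn
FROM   tbl
ORDER  BY a, user_id;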
Improved performance
This is an improved version of what @Tim Landscheidt posted. Vote for his answer if you like the idea with the index. Don't bother with small tables, but this will boost performance for big tables - provided you have a fitting index in place. The best choice would be a multicolumn index on (a, user_id).
WITH params(_a, _user_id) AS (SELECT 5.3293, 5344343213) -- enter params once
,x AS (
(
SELECT a
,user_id
,row_number() OVER (ORDER BY a DESC, user_id DESC) AS rn
FROM tbl, params p
WHERE a < p._a
OR a = p._a AND user_id < p._user_id -- a is not defined unique
ORDER BY a DESC, user_id DESC
LIMIT 5 -- 4 + 1: including central row
)
UNION ALL -- UNION right away, trim one query level
(
SELECT a
,user_id
,row_number() OVER (ORDER BY a ASC, user_id ASC) AS rn
FROM tbl, params p
WHERE a > p._a
OR a = p._a AND user_id > p._user_id
ORDER BY a ASC, user_id ASC
LIMIT 5
)
)
, y AS (
SELECT a, user_id
FROM x, params p
WHERE (a, user_id) <> (p._a, p._user_id) -- exclude central row
ORDER BY rn -- no need to ORDER BY a
LIMIT 4
)
SELECT *
FROM y
ORDER BY a, user_id -- ORDER result as requested
Major differences to @Tim's version:
According to the question, (a, user_id) forms the search criteria, not just a. That changes the window frame, ORDER BY and WHERE clause in subtly different ways.
UNION right away, no need for an extra query level. You need parentheses around the two UNION queries to allow for individual ORDER BY clauses.
Sort the result as requested. This requires another query level (at hardly any cost).
As parameters are used in multiple places I centralized the input in a leading CTE.
For repeated use, you can wrap this query almost 'as is' into an SQL or plpgsql function, as sketched below.
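A minimal sketch of such an SQL function, assuming the test table tbl from above (the function name nearest_rows is made up; the redundant central-row filter is dropped because the strict inequalities already exclude that row):
CREATE FUNCTION nearest_rows(_a float, _user_id bigint)
  RETURNS TABLE (a float, user_id bigint)
  LANGUAGE sql STABLE AS
$func$
WITH x AS (
   (
   SELECT a, user_id
         ,row_number() OVER (ORDER BY a DESC, user_id DESC) AS rn
   FROM   tbl
   WHERE  a < _a
      OR  a = _a AND user_id < _user_id
   ORDER  BY a DESC, user_id DESC
   LIMIT  4
   )
   UNION ALL
   (
   SELECT a, user_id
         ,row_number() OVER (ORDER BY a ASC, user_id ASC) AS rn
   FROM   tbl
   WHERE  a > _a
      OR  a = _a AND user_id > _user_id
   ORDER  BY a ASC, user_id ASC
   LIMIT  4
   )
   )
SELECT a, user_id
FROM  (SELECT a, user_id FROM x ORDER BY rn LIMIT 4) y
ORDER  BY a, user_id
$func$;
-- Usage:
SELECT * FROM nearest_rows(1.3293, 7876543213);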
And another one:
WITH prec_rows AS
(SELECT a,
user_id,
ROW_NUMBER() OVER (ORDER BY a DESC) AS rn
FROM tbl
WHERE a < 1.3293
ORDER BY a DESC LIMIT 4),
succ_rows AS
(SELECT a,
user_id,
ROW_NUMBER() OVER (ORDER BY a ASC) AS rn
FROM tbl
WHERE a > 1.3293
ORDER BY a ASC LIMIT 4)
SELECT a, user_id
FROM
(SELECT a,
user_id,
rn
FROM prec_rows
UNION ALL SELECT a,
user_id,
rn
FROM succ_rows) AS s
ORDER BY rn, a LIMIT 4;
AFAIR, WITH will materialize an in-memory table, so the focus of this solution is to limit its size as much as possible (in this case to eight rows).
set search_path='tmp';
DROP TABLE lutser;
CREATE TABLE lutser
( val float
, num bigint
);
INSERT INTO lutser(val, num)
VALUES ( 0.1133 , 2312882332 )
,( 4.3293 , 7876123213 )
,( 3.1133 , 2312332332 )
,( 1.3293 , 7876543213 )
,( 0.0033 , 2312222332 )
,( 5.3293 , 5344343213 )
,( 3.2133 , 4122331112 )
,( 2.3293 , 9999942333 )
;
WITH ranked_lutsers AS (
SELECT val, num
,rank() OVER (ORDER BY val) AS rnk
FROM lutser
)
SELECT that.val, that.num
, (that.rnk-this.rnk) AS relrnk
FROM ranked_lutsers that
JOIN ranked_lutsers this ON (that.rnk BETWEEN this.rnk-2 AND this.rnk+2)
WHERE this.val = 1.3293
;
Results:
DROP TABLE
CREATE TABLE
INSERT 0 8
val | num | relrnk
--------+------------+--------
0.0033 | 2312222332 | -2
0.1133 | 2312882332 | -1
1.3293 | 7876543213 | 0
2.3293 | 9999942333 | 1
3.1133 | 2312332332 | 2
(5 rows)
As Erwin pointed out, the center row is not wanted in the output. Also, row_number() should be used instead of rank().
WITH ranked_lutsers AS (
SELECT val, num
-- ,rank() OVER (ORDER BY val) AS rnk
, row_number() OVER (ORDER BY val, num) AS rnk
FROM lutser
) SELECT that.val, that.num
, (that.rnk-this.rnk) AS relrnk
FROM ranked_lutsers that
JOIN ranked_lutsers this ON (that.rnk BETWEEN this.rnk-2 AND this.rnk+2 )
WHERE this.val = 1.3293
AND that.rnk <> this.rnk
;
Result2:
val | num | relrnk
--------+------------+--------
0.0033 | 2312222332 | -2
0.1133 | 2312882332 | -1
2.3293 | 9999942333 | 1
3.1133 | 2312332332 | 2
(4 rows)
UPDATE 2: to always select four rows, even if we are at the top or bottom of the list. This makes the query a bit uglier (but not as ugly as Erwin's ;-).
WITH ranked_lutsers AS (
SELECT val, num
-- ,rank() OVER (ORDER BY val) AS rnk
, row_number() OVER (ORDER BY val, num) AS rnk
FROM lutser
) SELECT that.val, that.num
, ABS(that.rnk-this.rnk) AS srtrnk
, (that.rnk-this.rnk) AS relrnk
FROM ranked_lutsers that
JOIN ranked_lutsers this ON (that.rnk BETWEEN this.rnk-4 AND this.rnk+4 )
-- WHERE this.val = 1.3293
WHERE this.val = 0.1133
AND that.rnk <> this.rnk
ORDER BY srtrnk ASC
LIMIT 4
;
Output:
val | num | srtrnk | relrnk
--------+------------+--------+--------
0.0033 | 2312222332 | 1 | -1
1.3293 | 7876543213 | 1 | 1
2.3293 | 9999942333 | 2 | 2
3.1133 | 2312332332 | 3 | 3
(4 rows)
UPDATE: A version with a nested CTE (featuring an outer join!!!). For convenience, I added a primary key to the table, which sounds like a good idea anyway IMHO.
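A plausible way to add it, since that DDL isn't shown here (the column name id is assumed from the query below; for this small table the ids follow insertion order, so id 1 is the 0.1133 row):
ALTER TABLE lutser ADD COLUMN id serial PRIMARY KEY;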
WITH distance AS (
WITH ranked_lutsers AS (
SELECT id
, row_number() OVER (ORDER BY val, num) AS rnk
FROM lutser
) SELECT l0.id AS one
,l1.id AS two
, ABS(l1.rnk-l0.rnk) AS dist
-- Warning: Cartesian product below
FROM ranked_lutsers l0
, ranked_lutsers l1 WHERE l0.id <> l1.id
)
SELECT lu.*
FROM lutser lu
JOIN distance di
ON lu.id = di.two
WHERE di.one = 1
ORDER BY di.dist
LIMIT 4
;