How can I count elements while aggregating in ClickHouse?

I have data like this:
customer_id - product_id
1 - 10
1 - 11
1 - 12
1 - 11
2 - 15
2 - 20
After aggregating, I want to get:
customer_id - {product_id: count}
1 - {10:1, 11:2, 12:1}
2 - {15:1, 20:1}

Try this query:
SELECT
    customer_id,
    arrayMap((product_id, count) -> (product_id, count),
        untuple(sumMap([product_id], [1]))) AS result
FROM
(
    /* Emulate the test dataset. */
    SELECT
        data.1 AS customer_id,
        data.2 AS product_id
    FROM
    (
        SELECT arrayJoin([(1, 10), (1, 11), (1, 12), (1, 11),
                          (2, 15), (2, 20)]) AS data
    )
)
GROUP BY customer_id
/*
┌─customer_id─┬─result─────────────────┐
│           1 │ [(10,1),(11,2),(12,1)] │
│           2 │ [(15,1),(20,1)]        │
└─────────────┴────────────────────────┘
*/
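To see why this works: sumMap with array arguments sums the values per distinct key and returns a tuple of two parallel arrays (keys in ascending order, summed values), which untuple expands into the two array arguments that arrayMap expects. A minimal standalone check of that shape (a sketch; exact output formatting may differ between ClickHouse versions):
SELECT sumMap([product_id], [1]) AS t
FROM (SELECT arrayJoin([10, 11, 12, 11]) AS product_id)
/* expected: ([10,11,12],[1,2,1]) */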

An alternative using groupArray, arrayDistinct and arrayCount:
WITH dataset AS
(
    SELECT
        data.1 AS customer_id,
        data.2 AS product_id
    FROM
    (
        SELECT arrayJoin([
            (1, 10), (1, 11), (1, 12), (1, 11), (2, 15), (2, 20)
        ]) AS data
    )
)
SELECT
    customer_id,
    arrayMap(
        x -> (x, arrayCount(y -> (y = x), groupArray(product_id) AS product_ids)),
        arrayDistinct(product_ids)
    ) AS result
FROM dataset
GROUP BY customer_id

┌─customer_id─┬─result─────────────────┐
│           1 │ [(10,1),(11,2),(12,1)] │
│           2 │ [(15,1),(20,1)]        │
└─────────────┴────────────────────────┘
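If you want an actual Map in the result (closer to the {10:1, 11:2, 12:1} shape asked for), then on ClickHouse versions that support the Map data type you can try casting the tuple of arrays produced by sumMap. This is a sketch under that assumption, not a drop-in for older servers:
SELECT
    customer_id,
    CAST(sumMap([product_id], [1]), 'Map(UInt32, UInt64)') AS result
FROM dataset
GROUP BY customer_id
/* expected shape: {10:1,11:2,12:1} for customer 1 */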

when / then / otherwise with values from numpy array

Say I have
df = pl.DataFrame({'group': [1, 1, 1, 3, 3, 3, 4, 4]})
and a numpy array of values which I'd like to replace group 3 with:
values = np.array([9, 8, 7])
Here's what I've tried:
(
    df
    .with_column(
        pl.when(pl.col('group') == 3)
        .then(values)
        .otherwise(pl.col('group'))
        .alias('group')
    )
)
In [4]: df.with_column(pl.when(pl.col('group')==3).then(values).otherwise(pl.col('group')).alias('group'))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In [4], line 1
----> 1 df.with_column(pl.when(pl.col('group')==3).then(values).otherwise(pl.col('group')).alias('group'))
File ~/tmp/.venv/lib/python3.8/site-packages/polars/internals/whenthen.py:132, in When.then(self, expr)
111 def then(
112 self,
113 expr: (
(...)
121 ),
122 ) -> WhenThen:
123 """
124 Values to return in case of the predicate being `True`.
125
(...)
130
131 """
--> 132 expr = pli.expr_to_lit_or_expr(expr)
133 pywhenthen = self._pywhen.then(expr._pyexpr)
134 return WhenThen(pywhenthen)
File ~/tmp/.venv/lib/python3.8/site-packages/polars/internals/expr/expr.py:118, in expr_to_lit_or_expr(expr, str_to_lit)
116 return expr.otherwise(None)
117 else:
--> 118 raise ValueError(
119 f"did not expect value {expr} of type {type(expr)}, maybe disambiguate with"
120 " pl.lit or pl.col"
121 )
ValueError: did not expect value [9 8 7] of type <class 'numpy.ndarray'>, maybe disambiguate with pl.lit or pl.col
How can I do this correctly?
A few things to consider.
One is that you should always convert your numpy arrays to Polars Series, as Polars uses the Arrow memory specification underneath, not numpy's.
Second is that when -> then -> otherwise operates on columns of equal length. We nudge the API in a direction where you define a logical statement based on columns in your DataFrame, so you should not need to know the indices (nor the length of a group) that you want to replace. This allows for many optimizations, because if you do not define indices to replace, we can push down a filter before that expression.
Anyway, your specific situation does know the length of the group, so we must use something different. We can first compute the indices where the condition holds and then modify the column at those indices.
import numpy as np
import polars as pl

df = pl.DataFrame({
    "group": [1, 1, 1, 3, 3, 3, 4, 4]
})
values = np.array([9, 8, 7])

# compute the indices where the predicate holds
idx = df.select(
    pl.arg_where(pl.col("group") == 3)
).to_series()

# mutate at those locations
df.with_column(
    df["group"].set_at_idx(idx, pl.Series(values))
)
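On more recent Polars releases (an assumption — check your version), the same approach can be written with the renamed APIs: with_columns replaces with_column and Series.scatter replaces Series.set_at_idx. A sketch:
import numpy as np
import polars as pl

df = pl.DataFrame({"group": [1, 1, 1, 3, 3, 3, 4, 4]})
values = np.array([9, 8, 7])

# row positions where the predicate holds
idx = df.select(pl.arg_where(pl.col("group") == 3)).to_series()

# scatter() writes the replacement values at those positions
out = df.with_columns(df["group"].scatter(idx, pl.Series(values)))
print(out)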
Here's all I could come up with:
df.with_column(
    pl.col("group")
    .cumcount()
    .over(pl.col("group"))
    .alias("idx")
).apply(
    lambda x: values[x[1]] if x[0] == 3 else x[0]
).select(
    pl.col("apply").alias("group")
)
Surely there's a simpler way?
In [28]: df.with_column(pl.col('group').cumcount().over(pl.col('group')).alias('idx')).apply(lambda x:
...: values[x[1]] if x[0] == 3 else x[0]).select(pl.col('apply').alias('group'))
Out[28]:
shape: (8, 1)
┌───────┐
│ group │
│ --- │
│ i64 │
╞═══════╡
│ 1 │
├╌╌╌╌╌╌╌┤
│ 1 │
├╌╌╌╌╌╌╌┤
│ 1 │
├╌╌╌╌╌╌╌┤
│ 9 │
├╌╌╌╌╌╌╌┤
│ 8 │
├╌╌╌╌╌╌╌┤
│ 7 │
├╌╌╌╌╌╌╌┤
│ 4 │
├╌╌╌╌╌╌╌┤
│ 4 │
└───────┘

How can I use a lateral join to flatten jsonb in Postgres?

I have data like this, which I need to flatten so that each id has the corresponding KEY and size in two separate columns.
I was watching a Snowflake tutorial which uses this function:
select distinct json.key as column_name
from raw.public.table_name,
lateral flatten(input => table_name) json
I was trying to find the equivalent in a Postgres query.
id | json_data
1  | {"KEY": "mekq1232314342134434", "size": 0}
2  | {"KEY": "meksaq12323143421344", "size": 2}
3  | {"KEY": "meksaq12323324421344", "size": 3}
I need two things here:
1. the distinct keys from these jsonb columns;
2. the jsonb column flattened, like this:
id | KEY                  | size
1  | mekq1232314342134434 | 0
Another option, besides ->>, would be to use jsonb_to_record:
with
sample_data (id, json) as (values
    (1, '{"KEY": "mekq1232314342134434", "size": 0}' :: jsonb),
    (2, '{"KEY": "meksaq12323143421344", "size": 2}' :: jsonb),
    (3, '{"KEY": "meksaq12323324421344", "size": 3}' :: jsonb)
)
select
    id, "KEY", size
from
    sample_data,
    lateral jsonb_to_record(sample_data.json) as ("KEY" text, size int);
-- `lateral` is optional in this case
┌────┬──────────────────────┬──────┐
│ id │         KEY          │ size │
├────┼──────────────────────┼──────┤
│  1 │ mekq1232314342134434 │    0 │
│  2 │ meksaq12323143421344 │    2 │
│  3 │ meksaq12323324421344 │    3 │
└────┴──────────────────────┴──────┘
(3 rows)
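For the first part of the question (the distinct keys), and for the plain ->> operator variant mentioned above, here is a sketch; my_table is a hypothetical name standing in for your table with the id and json_data columns:
-- distinct top-level keys across all rows
select distinct jsonb_object_keys(json_data) as column_name
from my_table;

-- flatten with ->> (returns text, so cast size)
select id,
       json_data ->> 'KEY'         as "KEY",
       (json_data ->> 'size')::int as size
from my_table;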

How to set values from a recursive query in PostgreSQL?

I have a query which gives a result:
 id | manager_id | level | star_level
----+------------+-------+------------
  1 |       NULL |     1 |          0
  2 |          1 |     2 |          1
  3 |          2 |     3 |          1
  4 |          3 |     4 |          2
  5 |          4 |     5 |          2
  6 |          5 |     6 |          2
  7 |          6 |     7 |          3
  8 |          7 |     8 |          3
  9 |          8 |     9 |          4
(9 rows)
Here is the query:
WITH RECURSIVE parents AS (
    SELECT e.id
         , e.manager_id
         , 1 AS level
         , CAST(s.is_star AS INTEGER) AS star_level
    FROM employees AS e
    INNER JOIN skills AS s
        ON e.skill_id = s.id
    WHERE manager_id IS NULL

    UNION ALL

    SELECT e.id
         , e.manager_id
         , p.level + 1 AS level
         , p.star_level + CAST(s.is_star AS INTEGER) AS star_level
    FROM employees AS e
    INNER JOIN skills AS s
        ON e.skill_id = s.id
    INNER JOIN parents AS p
        ON e.manager_id = p.id
    WHERE e.manager_id = p.id
)
SELECT *
FROM parents
;
Can you please tell me how to change the query so that, in the same query, the level and star_level values are written to the corresponding columns?
Demo data:
create table Employees (
    id INT,
    name VARCHAR,
    manager_id INT,
    skill_id INT,
    level INT,
    star_level INT
);
create table Skills (
    id INT,
    name VARCHAR,
    is_star BOOL
);
INSERT INTO Employees
(id, name, manager_id, skill_id)
VALUES
(1, 'Employee 1', NULL, 1),
(2, 'Employee 2', 1, 2),
(3, 'Employee 3', 2, 3),
(4, 'Employee 4', 3, 4),
(5, 'Employee 5', 4, 5),
(6, 'Employee 6', 5, 1),
(7, 'Employee 7', 6, 2),
(8, 'Employee 8', 7, 3),
(9, 'Employee 9', 8, 4)
;
INSERT INTO Skills
(id, name, is_star)
VALUES
(1, 'Skill 1', FALSE),
(2, 'Skill 2', TRUE),
(3, 'Skill 3', FALSE),
(4, 'Skill 4', TRUE),
(5, 'Skill 5', FALSE)
;
As a result, I need a single query that computes the level and star_level values for the Employees table and writes them back into the Employees table.
You can use an UPDATE statement together with your CTE:
with recursive parents as (
    ... your original query goes here ...
)
update employees
set level = p.level,
    star_level = p.star_level
from parents p
where employees.id = p.id;
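Spelled out with the recursive query from the question inlined, it would look roughly like this (only the final SELECT is replaced by the UPDATE; the redundant WHERE e.manager_id = p.id is dropped, since the join condition already enforces it):
WITH RECURSIVE parents AS (
    SELECT e.id
         , e.manager_id
         , 1 AS level
         , CAST(s.is_star AS INTEGER) AS star_level
    FROM employees AS e
    INNER JOIN skills AS s
        ON e.skill_id = s.id
    WHERE manager_id IS NULL

    UNION ALL

    SELECT e.id
         , e.manager_id
         , p.level + 1 AS level
         , p.star_level + CAST(s.is_star AS INTEGER) AS star_level
    FROM employees AS e
    INNER JOIN skills AS s
        ON e.skill_id = s.id
    INNER JOIN parents AS p
        ON e.manager_id = p.id
)
UPDATE employees
SET level = p.level,
    star_level = p.star_level
FROM parents AS p
WHERE employees.id = p.id;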

PostgreSQL: detecting the first/last rows of a result set

Is there any way to embed a flag in a select that indicates that it is the first or the last row of a result set? I'm thinking something to the effect of:
> SELECT is_first_row() AS f, is_last_row() AS l FROM blah;
f | l
-----------
t | f
f | f
f | f
f | f
f | t
The answer might be in window functions but I've only just learned about them, and I question their efficiency.
SELECT first_value(unique_column) OVER () = unique_column, last_value(unique_column) OVER () = unique_column, * FROM blah;
seems to do what I want. Unfortunately, I don't even fully understand that syntax, but since unique_column is unique and NOT NULL it should deliver unambiguous results. But if it does sorting, then the cure might be worse than the disease. (Actually, in my tests, unique_column is not sorted, so that's something.)
EXPLAIN ANALYZE doesn't indicate there's an efficiency problem, but when has it ever told me what I needed to know?
And I might need to use this in an aggregate function, but I've just been told window functions aren't allowed there. 😕
Edit:
Actually, I just added ORDER BY unique_column to the above query and the rows identified as first and last were thrown into the middle of the result set. It's as if first_value()/last_value() really means "the first/last value I picked up before I began sorting." I don't think I can safely do this optimally. Not unless a much better understanding of the use of the OVER keyword is to be had.
I'm running PostgreSQL 9.6 in a Debian 9.5 environment.
This isn't a duplicate, because I'm trying to get the first row and last row of the result set to identify themselves, while Postgres: get min, max, aggregate values in one select is just going for the minimum and maximum values for a column in a result set.
You can use the lag() and lead() window functions (over the appropriate window) and compare them to NULL: lag() is NULL on the first row of the window order, and lead() is NULL on the last:
-- \i tmp.sql
CREATE TABLE ztable
( id SERIAL PRIMARY KEY
, starttime TIMESTAMP
);
INSERT INTO ztable (starttime) VALUES ( now() - INTERVAL '1 minute');
INSERT INTO ztable (starttime) VALUES ( now() - INTERVAL '2 minute');
INSERT INTO ztable (starttime) VALUES ( now() - INTERVAL '3 minute');
INSERT INTO ztable (starttime) VALUES ( now() - INTERVAL '4 minute');
INSERT INTO ztable (starttime) VALUES ( now() - INTERVAL '5 minute');
INSERT INTO ztable (starttime) VALUES ( now() - INTERVAL '6 minute');
SELECT id, starttime
     , (lag(id)  OVER www IS NULL) AS is_first   -- no previous row: first in window order
     , (lead(id) OVER www IS NULL) AS is_last    -- no next row: last in window order
FROM ztable
WINDOW www AS (ORDER BY id)
ORDER BY id
;
SELECT id, starttime
     , (lag(id)  OVER www IS NULL) AS is_first
     , (lead(id) OVER www IS NULL) AS is_last
FROM ztable
WINDOW www AS (ORDER BY starttime)
ORDER BY id
;
SELECT id, starttime
     , (lag(id)  OVER www IS NULL) AS is_first
     , (lead(id) OVER www IS NULL) AS is_last
FROM ztable
WINDOW www AS (ORDER BY starttime)
ORDER BY random()
;
Result:
INSERT 0 1
INSERT 0 1
INSERT 0 1
INSERT 0 1
INSERT 0 1
INSERT 0 1
 id |         starttime          | is_first | is_last
----+----------------------------+----------+---------
  1 | 2018-08-31 18:38:45.567393 | t        | f
  2 | 2018-08-31 18:37:45.575586 | f        | f
  3 | 2018-08-31 18:36:45.587436 | f        | f
  4 | 2018-08-31 18:35:45.592316 | f        | f
  5 | 2018-08-31 18:34:45.600619 | f        | f
  6 | 2018-08-31 18:33:45.60907  | f        | t
(6 rows)
 id |         starttime          | is_first | is_last
----+----------------------------+----------+---------
  1 | 2018-08-31 18:38:45.567393 | f        | t
  2 | 2018-08-31 18:37:45.575586 | f        | f
  3 | 2018-08-31 18:36:45.587436 | f        | f
  4 | 2018-08-31 18:35:45.592316 | f        | f
  5 | 2018-08-31 18:34:45.600619 | f        | f
  6 | 2018-08-31 18:33:45.60907  | t        | f
(6 rows)
 id |         starttime          | is_first | is_last
----+----------------------------+----------+---------
  2 | 2018-08-31 18:37:45.575586 | f        | f
  4 | 2018-08-31 18:35:45.592316 | f        | f
  6 | 2018-08-31 18:33:45.60907  | t        | f
  5 | 2018-08-31 18:34:45.600619 | f        | f
  1 | 2018-08-31 18:38:45.567393 | f        | t
  3 | 2018-08-31 18:36:45.587436 | f        | f
(6 rows)
[updated: added a randomly sorted case]
It is simple using window functions with particular frames:
with t(x, y) as (select generate_series(1,5), random())
select
    *,
    count(*) over (rows between unbounded preceding and current row),
    count(*) over (rows between current row and unbounded following)
from t;
┌───┬───────────────────┬───────┬───────┐
│ x │         y         │ count │ count │
├───┼───────────────────┼───────┼───────┤
│ 1 │ 0.543995119165629 │     1 │     5 │
│ 2 │ 0.886343683116138 │     2 │     4 │
│ 3 │ 0.124682310037315 │     3 │     3 │
│ 4 │ 0.668972567655146 │     4 │     2 │
│ 5 │ 0.266671542543918 │     5 │     1 │
└───┴───────────────────┴───────┴───────┘
As you can see, count(*) over (rows between unbounded preceding and current row) returns the row count from the beginning of the data set up to the current row, and count(*) over (rows between current row and unbounded following) returns the row count from the current row to the end of the data set. A value of 1 marks the first/last row respectively.
This works until you order the data set with ORDER BY; in that case you need to repeat the ordering in the frame definitions:
with t(x, y) as (select generate_series(1,5), random())
select
    *,
    count(*) over (order by y rows between unbounded preceding and current row),
    count(*) over (order by y rows between current row and unbounded following)
from t
order by y;
┌───┬───────────────────┬───────┬───────┐
│ x │         y         │ count │ count │
├───┼───────────────────┼───────┼───────┤
│ 1 │ 0.125781774986535 │     1 │     5 │
│ 4 │ 0.25046408502385  │     2 │     4 │
│ 5 │ 0.538880597334355 │     3 │     3 │
│ 3 │ 0.802807193249464 │     4 │     2 │
│ 2 │ 0.869908029679209 │     5 │     1 │
└───┴───────────────────┴───────┴───────┘
PS: As mentioned by a_horse_with_no_name in the comment:
there is no such thing as the "first" or "last" row without sorting.
In fact, window functions are a great approach, and for your requirement they work well.
Regarding efficiency, window functions operate over the data set already at hand, which means the DBMS just adds some extra processing to infer the first/last values.
One thing I'd suggest: put an ORDER BY criterion inside the OVER clause, to ensure the data set order is the same between multiple executions and you get the same values back.
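As a sketch of that suggestion applied to the ztable example above, using row_number() instead of lead()/lag() (the window name w is arbitrary):
SELECT id, starttime
     , (row_number() OVER w = 1)                AS is_first
     , (row_number() OVER w = count(*) OVER ()) AS is_last
FROM ztable
WINDOW w AS (ORDER BY starttime)
ORDER BY starttime;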
Try using:
(
    SELECT columns
    FROM mytable
    Join conditions
    WHERE conditions
    ORDER BY date DESC
    LIMIT 1
)
UNION ALL
(
    SELECT columns
    FROM mytable
    Join conditions
    WHERE conditions
    ORDER BY date ASC
    LIMIT 1
)
Each branch fetches just one row (note the parentheses, which Postgres requires when ORDER BY/LIMIT apply to an individual UNION branch), which cuts the processing time. You can add an index on the date column as well.

Cumulative count on history table with deleted attributes

I've got a history table of updates to records, and I want to calculate cumulative totals where values may be added to or removed from the set (i.e. the cumulative total for one month may be less than the previous month's).
For example, here's a table with the history of updates to tags for a person record. (id is the id of the person record).
I want to count how many people had the "established" tag in any given month, accounting for when it was added or removed in a prior month.
+----+------------------------+---------------------+
| id | tags                   | created_at          |
+----+------------------------+---------------------+
|  1 | ["vip", "established"] | 2017-01-01 00:00:00 |
|  2 | ["established"]        | 2017-01-01 00:00:00 |
|  3 | ["established"]        | 2017-02-01 00:00:00 |
|  1 | ["vip"]                | 2017-03-01 00:00:00 |
|  4 | ["established"]        | 2017-05-01 00:00:00 |
+----+------------------------+---------------------+
With some help from these posts, I've gotten this far:
SELECT
    item_month,
    sum(count(distinct(id))) OVER (ORDER BY item_month)
FROM (
    SELECT
        to_char("created_at", 'yyyy-mm') as item_month,
        id
    FROM person_history
    WHERE tags ? 'established'
) t1
GROUP BY item_month;
Which gives me:
month    count
2017-01  2
2017-02  3
2017-05  4   <--- should be 3
And it's also missing an entry for 2017-03 which should be 2.
(An entry for 2017-04 would be nice too, but the UI could always infer it from the previous month if need be)
Here is a step-by-step walkthrough; you can try collapsing all those CTEs afterwards:
with
-- Example data
person_history(id, tags, created_at) as (values
    (1, '["vip", "est"]'::jsonb, '2017-01-01'::timestamp),
    (2, '["est"]', '2017-01-01'), -- Note that person 2 changed its tags several times per month
    (2, '["vip"]', '2017-01-02'),
    (2, '["vip", "est"]', '2017-01-03'),
    (3, '["est"]', '2017-02-01'),
    (1, '["vip"]', '2017-03-01'),
    (4, '["est"]', '2017-05-01')),
-- Get the last tags for each person per month
monthly as (
    select distinct on (id, date_trunc('month', created_at))
        id,
        date_trunc('month', created_at) as month,
        tags,
        created_at
    from person_history
    order by 1, 2, created_at desc),
-- Retrieve tags from the previous month
monthly_prev as (
    select
        *,
        coalesce((lag(tags) over (partition by id order by month)), '[]') as prev_tags
    from monthly),
-- Calculate the delta: if "est" was added then 1, removed then -1, otherwise 0
monthly_delta as (
    select
        *,
        case
            when tags ? 'est' and not prev_tags ? 'est' then 1
            when not tags ? 'est' and prev_tags ? 'est' then -1
            else 0
        end as delta
    from monthly_prev),
-- Sum all deltas for each month
monthly_total as (
    select month, sum(delta) as total
    from monthly_delta
    group by month)
-- Finally calculate the cumulative sum
select *, sum(total) over (order by month) from monthly_total
order by month;
Result:
┌─────────────────────┬───────┬─────┐
│        month        │ total │ sum │
├─────────────────────┼───────┼─────┤
│ 2017-01-01 00:00:00 │     2 │   2 │
│ 2017-02-01 00:00:00 │     1 │   3 │
│ 2017-03-01 00:00:00 │    -1 │   2 │
│ 2017-05-01 00:00:00 │     1 │   3 │
└─────────────────────┴───────┴─────┘
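If you also want rows for months where nothing changed (like 2017-04 in the question), one option is to add a generated calendar and left join the monthly totals onto it before taking the running sum. A sketch with the date bounds hard-coded for illustration; in practice you would derive them from the min/max of created_at:
-- add after monthly_total in the CTE chain above, then replace the final SELECT
, months as (
    select generate_series('2017-01-01'::timestamp,
                           '2017-05-01'::timestamp,
                           interval '1 month') as month)
select m.month,
       coalesce(t.total, 0) as total,
       sum(coalesce(t.total, 0)) over (order by m.month) as sum
from months m
left join monthly_total t using (month)
order by m.month;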