[Error 10025]: Line 5:3 Expression not in GROUP BY key 'dotProductSum' - group-by

DROP TABLE IF EXISTS recommendations;
CREATE TABLE recommendations AS
SELECT
movie_id AS movie_id,
movie_id_2 AS movie_id_2,
(N * dotProductSum - ratingSum * rating2Sum) / (SQRT(N * ratingSqSum - ratingSum * ratingSum) * SQRT(N * rating2SqSum - rating2Sum * rating2Sum))
AS correlation
FROM
coratings
GROUP BY
coratings.movie_id, coratings.movie_id_2, coratings.dotProductSum
ORDER BY
movie_id;
I have added dotProductSum to the query, but I still get the error below:
org.apache.hive.service.cli.HiveSQLException: Error while compiling statement: FAILED: SemanticException [Error 10025]: Line 5:3 Expression not in GROUP BY key 'dotProductSum'
Below is the coratings table:
CREATE TABLE coratings AS
SELECT
a.movie_id ,
b.movie_id AS movie_id_2 ,
COUNT(*) AS N ,
SUM(a.rating) AS ratingSum ,
SUM(b.rating) AS rating2Sum ,
SUM(a.rating * b.rating) AS dotProductSum ,
SUM(a.rating * a.rating) AS ratingSqSum ,
SUM(b.rating * b.rating) AS rating2SqSum
FROM
movies_ratings a
JOIN movies_ratings b ON a.user_id = b.user_id
WHERE
a.movie_id < b.movie_id
GROUP BY
a.movie_id, b.movie_id
HAVING
N >= 30;
The movies_ratings table:
USE rec_engine;
-- the table definition will be created here
DROP TABLE IF EXISTS movies_ratings;
CREATE TABLE movies_ratings ( user_id INT, movie_id INT, rating INT, `time_stamp` INT )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"field.delim"=",",
"skip.header.line.count"="1"
);
-- the data file must be in the local file system (/home/hive folder); otherwise it can't be loaded
LOAD DATA LOCAL INPATH 'ratings_small.csv'
OVERWRITE INTO TABLE movies_ratings;
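Since coratings is already aggregated to one row per (movie_id, movie_id_2) pair, the recommendations query performs no further aggregation, so the GROUP BY can simply be dropped; without a GROUP BY, Hive no longer requires every selected expression to be a grouping key or an aggregate. A sketch of the corrected query:
DROP TABLE IF EXISTS recommendations;
CREATE TABLE recommendations AS
SELECT
  movie_id,
  movie_id_2,
  (N * dotProductSum - ratingSum * rating2Sum) /
  (SQRT(N * ratingSqSum - ratingSum * ratingSum) * SQRT(N * rating2SqSum - rating2Sum * rating2Sum)) AS correlation
FROM coratings
ORDER BY movie_id;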

Related

Query for distinct set of non-overlapping ranges in PostgreSQL

Given a column of overlapping and/or discontinuous ranges:
WITH tbl (active_dates) AS
(
VALUES
('["2015-05-21","2018-10-01")'::TSRANGE),
('["2016-08-13","2018-09-01")'::TSRANGE),
('["2019-03-01","2019-05-01")'::TSRANGE)
)
SELECT *
FROM tbl;
How can we generate an output that identifies all the discrete time periods like so:
active_dates
------------
["2015-05-21 00:00:00","2016-08-13 00:00:00")
["2016-08-13 00:00:00","2018-09-01 00:00:00")
["2018-09-01 00:00:00","2018-10-01 00:00:00")
["2019-03-01 00:00:00","2018-05-01 00:00:00")
As always, you can do that with window functions:
WITH tbl (active_dates) AS
(
VALUES
('["2015-05-21","2018-10-01")'::TSRANGE),
('["2016-08-13","2018-09-01")'::TSRANGE),
('["2019-03-01","2019-05-01")'::TSRANGE)
),
/* get all time points where something changes */
points AS (
SELECT upper(active_dates) AS p
FROM tbl
UNION SELECT lower(active_dates)
FROM tbl
),
/*
* Get all date ranges between these time points.
* The first time range will start with NULL,
* but that will be excluded in the next CTE anyway.
*/
inter AS (
SELECT tsrange(
lag(p) OVER (ORDER BY p),
p
) i
FROM points
)
/*
* Get all date ranges that are contained
* in at least one of the intervals.
*/
SELECT DISTINCT i
FROM inter
CROSS JOIN tbl
WHERE i <@ active_dates
ORDER BY i;
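As a quick illustration of the range containment operator <@ used in the final WHERE clause:
SELECT tsrange('2016-01-01', '2016-02-01') <@ tsrange('2015-05-21', '2018-10-01');  -- true: the first range is contained by the second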

PostgreSQL: How to grab an existing id from the non-consecutive ids of a table

PostgreSQL version 9.4.
I have a table with an integer column, which holds a number of integers with some gaps, like the sample below. I'm trying to get an existing id from the column at random with the following query, but it occasionally returns NULL:
CREATE TABLE
IF NOT EXISTS test_tbl(
id INTEGER);
INSERT INTO test_tbl
VALUES (10),
(13),
(14),
(16),
(18),
(20);
-------------------------------
SELECT * FROM test_tbl;
-------------------------------
SELECT COALESCE(tmp.id, 20) AS classification_id
FROM (
SELECT tt.id,
row_number() over(
ORDER BY tt.id) AS row_num
FROM test_tbl tt
) tmp
WHERE tmp.row_num =floor(random() * 10);
Please let me know what I'm doing wrong.
but it returns NULL occasionally
And I must add that it sometimes returns more than one row, right?
In your sample data there are 6 rows, so the column row_num will have a value from 1 to 6.
This:
floor(random() * 10)
produces a random integer from 0 to 9, because random() returns a value from 0 up to (but not including) 1.
You should use:
floor(random() * 6 + 1)::int
to get a random integer from 1 to 6.
But this alone would not solve the problem, because the WHERE clause is evaluated once for each row: the generated random number may never match any row_num, so nothing is returned, or it may match on several rows, so more than one row is returned.
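A sketch of one workaround: wrap the random number in a scalar subquery, which PostgreSQL plans as an uncorrelated InitPlan and evaluates only once, so exactly one row matches:
SELECT tmp.id
FROM (
    SELECT tt.id,
           row_number() over (ORDER BY tt.id) AS row_num
    FROM test_tbl tt
) tmp
WHERE tmp.row_num = (SELECT floor(random() * 6 + 1)::int);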
The proper (although sometimes not the most efficient) way to get a random row is:
SELECT id FROM test_tbl ORDER BY random() LIMIT 1
Also check other links from SO, like:
quick random row selection in Postgres
You could select one row and order by random(); this way you are ensured to hit an existing row:
select id
from test_tbl
order by random()
LIMIT 1;
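For big tables, ORDER BY random() gets expensive because it must scan and sort every row. A common alternative sketch (assuming it is acceptable to count the rows first) is a random OFFSET:
SELECT id
FROM test_tbl
OFFSET floor(random() * (SELECT count(*) FROM test_tbl))::int
LIMIT 1;
OFFSET still skips over the preceding rows one by one, but it avoids the full sort.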

PostgreSQL Removing duplicates

I am working on a Postgres query to remove duplicates from a table. The following table is dynamically generated, and I want to write a SELECT query that removes a record when the first column holds a duplicate value.
The table looks something like this
1st col   2nd col
4         62
6         34
5         26
5         12
I want to write a select query which removes either row 3 or row 4.
There is no need for an intermediate table:
delete from df1
where ctid not in (select min(ctid)
from df1
group by first_column);
If you are deleting many rows from a large table, the approach with an intermediate table is probably faster.
If you just want to get unique values for one column, you can use:
select distinct on (first_column) *
from the_table
order by first_column;
Or simply
select first_column, min(second_column)
from the_table
group by first_column;
select count(first) as cnt, first, min(second) as second
from df1
group by first
having count(first) = 1
if you want to keep one of the rows (sorry, I initially missed that this was what you wanted):
select first, min(second)
from df1
group by first
Where the table's name is df1 and the columns are named first and second.
You can actually leave off the count(first) as cnt if you want.
At the risk of stating the obvious, once you know how to select the data you want (or don't want), deleting the records in any of a dozen ways is simple.
If you want to replace the table or make a new table you can just use create table as for the deletion:
create table tmp as
select count(first) as cnt, first, min(second) as second
from df1
group by first
having count(first) = 1;
drop table df1;
create table df1 as select * from tmp;
or using DELETE FROM:
DELETE FROM df1 WHERE first NOT IN (SELECT first FROM tmp);
You could also use select into, etc, etc.
if you want to SELECT unique rows:
SELECT * FROM ztable u
WHERE NOT EXISTS ( -- There is no other record
SELECT * FROM ztable x
WHERE x.id = u.id -- with the same id
AND x.ctid < u.ctid -- , but with a different(lower) "internal" rowid
); -- so u.* must be unique
if you want to SELECT the other rows, which were suppressed in the previous query:
SELECT * FROM ztable nu
WHERE EXISTS ( -- another record exists
SELECT * FROM ztable x
WHERE x.id = nu.id -- with the same id
AND x.ctid < nu.ctid -- , but with a different(lower) "internal" rowid
);
if you want to DELETE records, making the table unique (but keeping one record per id):
DELETE FROM ztable d
WHERE EXISTS ( -- another record exists
SELECT * FROM ztable x
WHERE x.id = d.id -- with the same id
AND x.ctid < d.ctid -- , but with a different(lower) "internal" rowid
);
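On PostgreSQL 8.4 or later, the same keep-one-row-per-id DELETE can also be written with a window function instead of the NOT EXISTS self-join; a sketch using the same ztable/id names:
DELETE FROM ztable
WHERE ctid IN (
    SELECT ctid
    FROM (SELECT ctid,
                 row_number() OVER (PARTITION BY id ORDER BY ctid) AS rn
          FROM ztable) numbered
    WHERE rn > 1 -- every row after the first per id
);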
So basically I did this
create temp table t1 as
select first, min(second) as second
from df1
group by first;

select * from df1
inner join t1 on t1.first = df1.first and t1.second = df1.second;
It's a satisfactory answer. Thanks for your help, @Hack-R.

Postgresql - get closest datetime row relative to given datetime value

I have a postgres table with a unique datetime field.
I would like to use/create a function that takes as argument a datetime value and returns the row id having the closest datetime relative (but not equal) to the passed datetime value. A second argument could specify before or after the passed value.
Ideally, some combination of native datetime functions could handle this requirement. Otherwise it'll have to be a custom function.
Question: What are methods for querying relative datetime over a collection of rows?
select id, passed_ts - ts_column difference
from t
where
passed_ts > ts_column and positive_interval
or
passed_ts < ts_column and not positive_interval
order by abs(extract(epoch from passed_ts - ts_column))
limit 1
passed_ts is the timestamp parameter and positive_interval is a boolean parameter: if true, only rows where the timestamp column is lower than the passed timestamp are considered; if false, the inverse.
To get the difference between two timestamps, simply use the - operator.
Assuming you have a table with attributes Key, Attr and T (timestamp with or without timezone):
you can search with
select min(T - TimeValue) from Table where (T - TimeValue) > interval '0';
this will give you the minimum difference. You can combine this value with a join to the same table to get the tuple you are interested in:
select * from (select *, T - TimeValue as diff from Table) as T1 NATURAL JOIN
( select min(T - TimeValue) as diff from Table where (T - TimeValue) > interval '0') as T2;
That should do it.
You want the first row of a select statement producing all the rows below (or above) the given datetime in descending (or ascending) order.
Pseudo code for the function body:
SELECT id
FROM table
WHERE IF(@above, datecol < @param, datecol > @param)
ORDER BY IF(@above, datecol ASC, datecol DESC)
LIMIT 1
However, this does not work: one cannot condition the ordering direction.
The second idea is to do both queries, and select afterwards:
SELECT *
FROM (
(
SELECT 'below' AS dir, id
FROM table
WHERE datecol < @param
ORDER BY datecol DESC
LIMIT 1
) UNION (
SELECT 'above' AS dir, id
FROM table
WHERE datecol > @param
ORDER BY datecol ASC
LIMIT 1)
) AS t
WHERE dir = @dir
That should be pretty fast with an index on the datetime column.
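For example, with a hypothetical table name events standing in for table above:
CREATE INDEX events_datecol_idx ON events (datecol);
Each branch of the UNION then becomes a short index range scan that stops after one row.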
-- test rig
DROP SCHEMA tmp CASCADE;
CREATE SCHEMA tmp ;
SET search_path=tmp;
CREATE TABLE lutser
( dt timestamp NOT NULL PRIMARY KEY
);
-- populate it
INSERT INTO lutser(dt)
SELECT gs
FROM generate_series('2013-04-30', '2013-05-01', '1 min'::interval) gs
;
DELETE FROM lutser WHERE random() < 0.9;
--
-- The query:
WITH xyz AS (
SELECT dt AS hh
, LAG (dt) OVER (ORDER by dt ) AS ll
FROM lutser
)
SELECT *
FROM xyz bb
WHERE '2013-04-30 12:00' BETWEEN bb.ll AND bb.hh
;
Result:
NOTICE: drop cascades to table tmp.lutser
DROP SCHEMA
CREATE SCHEMA
SET
NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "lutser_pkey" for table "lutser"
CREATE TABLE
INSERT 0 1441
DELETE 1288
hh | ll
---------------------+---------------------
2013-04-30 12:02:00 | 2013-04-30 11:50:00
(1 row)
Wrapping it into a function is left as an exercise for the reader.
UPDATE: here is a second one with the sandwiched-not-exists-trick (TM):
SELECT lo.dt AS ll
FROM lutser lo
JOIN lutser hi ON hi.dt > lo.dt
AND NOT EXISTS (
SELECT * FROM lutser nx
WHERE nx.dt < hi.dt
AND nx.dt > lo.dt
)
WHERE '2013-04-30 12:00' BETWEEN lo.dt AND hi.dt
;
You have to join the table to itself with the where condition looking for the smallest nonzero (negative or positive) interval between the base table row's datetime and the joined table row's datetime. It would be good to have an index on that datetime column.
P.S. You could also look for the max() of the previous or the min() of the subsequent.
Try something like:
SELECT *
FROM your_table
WHERE (dt_time > argument_time and search_above = 'true')
OR (dt_time < argument_time and search_above = 'false')
ORDER BY CASE WHEN search_above = 'true'
THEN dt_time - argument_time
ELSE argument_time - dt_time
END
LIMIT 1;
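Wrapped up as the function the question asks for (a sketch; your_table, dt_time, and the integer id column are placeholder names carried over from the query above):
CREATE OR REPLACE FUNCTION closest_row(argument_time timestamp, search_above boolean)
RETURNS integer AS $$
    SELECT id
    FROM your_table
    WHERE (dt_time > argument_time AND search_above)
       OR (dt_time < argument_time AND NOT search_above)
    ORDER BY CASE WHEN search_above
                  THEN dt_time - argument_time
                  ELSE argument_time - dt_time
             END
    LIMIT 1;
$$ LANGUAGE sql STABLE;
-- usage: SELECT closest_row('2013-04-30 12:00', true);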

Query with table, temp table, subquery join issues

I have a query where I create two temp tables at the start, then I query an existing table in my DB, join it to a subquery, and lastly join to one of the temp tables. When I do this I get an error that the key I'm joining on from the existing table cannot be bound. What's weird is that if I take out all references to the subquery and leave just the query with the existing table and the temp table, it joins fine; likewise, if I join the existing table to just the subquery, it works just fine.
But when I try to put all three together, it gives me "the multi-part identifier z.[currency key] could not be bound", which seems an odd error since this key is in an existing table and joins just fine to a temp table or subquery alone, but not to both together.
I know about issues with joining on subqueries, but in this situation the issue seems to be with joining to subqueries and temp tables in the same query, and I'm not sure how to work around it.
Code is below.
declare @tmpFx table (currency_key int, effective_date_key int, expiration_date_key int, to_usd float, from_usd float ) --primary key (currency_key, date_key))
insert into @tmpFx(currency_key, effective_date_key, expiration_date_key, to_usd, from_usd)
select [currency key], cast(convert(char(8),[effective date],112) as int), cast(convert(char(8),[expiration date],112) as int), [to usd], [from usd]
from v_fx --where [effective date] >= @beginDate
declare @fixedFx table (currency_key int, to_usd float, from_usd float primary key (currency_key))
insert into @fixedFx(currency_key, to_usd, from_usd)
select [currency key], [to usd], [from usd]
from v_fx where [effective date] = '2012-01-01'
select z.[currency key], --stat_fx.to_usd to_usd, stat_fx.from_usd from_usd, --q.*,--
stat_usd_amt2 = case when z.[currency key] = 100001 then q.orig_amt else 0 end --sum(q.orig_amt * stat_fx.to_usd)
from [dim country] z,
(select b.country_key, a.currency_key, a.data_type_key, sum(a.amount) orig_amt,
sum(a.amount * stat_fx.to_usd) stat_usd_amt,
sum((a.amount * stat_fx.to_usd) * stat_fx.from_usd) home_curr_amt
from tbl_cohort a
inner join tbl_management_code b on a.management_code = b.management_code
left outer join @tmpFx stat_fx on a.currency_key = stat_fx.currency_key
where a.data_type_key = 1
and a.date_key > 20111231
group by b.country_key, a.currency_key, a.data_type_key) q
inner join @tmpFx stat_fx on z.[currency key] = stat_fx.currency_key
where q.[country_key]= z.[country key]
I believe it is because you are mixing join formats between the old style and new style (as ninesided suggested). I was able to mock up a similar issue with my own data and got the same error about unbound identifiers. Give the following a try instead.
declare @tmpFx table (currency_key int, effective_date_key int, expiration_date_key int, to_usd float, from_usd float)
insert into @tmpFx(currency_key, effective_date_key, expiration_date_key, to_usd, from_usd)
select [currency key], cast(convert(char(8),[effective date],112) as int), cast(convert(char(8),[expiration date],112) as int), [to usd], [from usd]
from v_fx
declare @fixedFx table (currency_key int, to_usd float, from_usd float primary key (currency_key))
insert into @fixedFx(currency_key, to_usd, from_usd)
select [currency key], [to usd], [from usd]
from v_fx where [effective date] = '2012-01-01'
select z.[currency key],
case when z.[currency key] = 100001 then q.orig_amt else 0 end AS stat_usd_amt2
from [dim country] z
JOIN (
select b.country_key, a.currency_key, a.data_type_key, sum(a.amount) AS orig_amt,
sum(a.amount * stat_fx.to_usd) as stat_usd_amt,
sum((a.amount * stat_fx.to_usd) * stat_fx.from_usd) as home_curr_amt
from tbl_cohort a
join tbl_management_code b
on a.management_code = b.management_code
left join @tmpFx stat_fx
on a.currency_key = stat_fx.currency_key
where a.data_type_key = 1
and a.date_key > 20111231
group by b.country_key, a.currency_key, a.data_type_key
) q
ON q.[country_key] = z.[country key]
join @tmpFx stat_fx
on z.[currency key] = stat_fx.currency_key
And while I've left in your second temp table (@fixedFx), you may want to remove it if you don't plan on using its data.
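As a minimal illustration of why the original mixed syntax fails (a, b and c are hypothetical tables): in SQL Server, an ANSI JOIN binds before the comma-separated FROM items are combined, so its ON clause cannot see a table that was listed with an old-style comma:
SELECT *
FROM a, b
JOIN c ON c.x = a.x; -- fails: the multi-part identifier "a.x" could not be bound
-- ... ON c.x = b.x would compile, because b takes part in the JOIN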