How to hash aggregated results in postgresql incrementally without concatenating everything first - postgresql

I want to calculate a hash (sha256) of a set of results. I know how to do it:
-- The ordering must be given inside the aggregate call: a query-level ORDER BY on
-- a column that is neither grouped nor aggregated is rejected, and it would not
-- control the concatenation order anyway.
SELECT digest(string_agg(id_, ':' ORDER BY id_), 'sha256') FROM mytable;
It works, but it first concatenates all records and only then calculates the hash. I want the computation to be incremental to avoid possible memory problems with large datasets (millions of rows), something like:
SELECT digest_agg(id_, ':', 'sha256') from mytable order by id_;
Ensuring that the hash is calculated incrementally row by row on the way.

OK, I have finally coded a C extension for this. I am posting it here in case it is useful to someone.
/* ------- OS Includes ------------ */
#include <stdio.h>
#include <openssl/rsa.h>
#include <openssl/pem.h>
#include <openssl/err.h>
/* ----- PostgreSQL Includes -------*/
#include "postgres.h"
#include "fmgr.h"
#include "funcapi.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
PG_MODULE_MAGIC;
/*
 * Transition state carried from one call of the aggregate's transition
 * function to the next.
 */
typedef struct sha256_state
{
    SHA256_CTX *sha256;   /* hashing context, allocated and initialised by sha256_init() */
    bool        has_data; /* false until the first chunk has been hashed */
} SHA256_STATE;
/* --------- Prototypes ---------- */
void _PG_init(void);
void _PG_fini(void);
Datum dv_sha256_agg_sfunc(PG_FUNCTION_ARGS);
Datum dv_sha256_agg_final(PG_FUNCTION_ARGS);
SHA256_STATE* sha256_init(void);
void sha256_update(SHA256_STATE* state, char* string, int32 string_size);
void sha256_final(SHA256_STATE* state, char outputBuffer[65]);
/* ------ Version1 convention ---- */
PG_FUNCTION_INFO_V1(dv_sha256_agg_sfunc);
PG_FUNCTION_INFO_V1(dv_sha256_agg_final);
// -----------------------------------------------------------
// Implementations
// -----------------------------------------------------------
/* Nothing to set up for this module. */
void _PG_init(void)
{
}

/* Nothing to tear down for this module. */
void _PG_fini(void)
{
}
Datum dv_sha256_agg_sfunc(PG_FUNCTION_ARGS) {
SHA256_STATE* state;
bool is_first = PG_ARGISNULL(0);
text* string;
int32 string_size;
char* data;
char SEP[] = {'\n'}; // Hardcoded separator (can be improved)
if (is_first) {
// First iteration: Create the state
state = sha256_init();
}
else {
// Next iterations: Restore the state
state = (SHA256_STATE*) PG_GETARG_POINTER(0);
}
if (PG_ARGISNULL(1)) {
PG_RETURN_POINTER(state);
}
// Get current chunk
string = PG_GETARG_TEXT_PP(1);
string_size = VARSIZE_ANY_EXHDR(string);
data = VARDATA_ANY(string);
// Add separator
if (state->has_data) {
sha256_update(state, SEP, 1);
}
// Update state
sha256_update(state, data, string_size);
state->has_data = true;
// Updated state
PG_RETURN_POINTER(state);
}
Datum dv_sha256_agg_final(PG_FUNCTION_ARGS) {
SHA256_STATE* state;
char out[65];
text* hash = (text*) palloc(65 + VARHDRSZ);
if (PG_ARGISNULL(0)) {
PG_RETURN_NULL();
}
else {
state = (SHA256_STATE*) PG_GETARG_POINTER(0);
sha256_final(state, out);
SET_VARSIZE(hash, 65 + VARHDRSZ);
memcpy(VARDATA(hash), out, 65);
PG_RETURN_TEXT_P(hash);
}
}
SHA256_STATE* sha256_init() {
SHA256_STATE* state = (SHA256_STATE*) palloc(sizeof(SHA256_STATE));
state->sha256 = (SHA256_CTX*) palloc(sizeof(SHA256_CTX));
SHA256_Init(state->sha256);
state->has_data = false;
return state;
}
/* Feed string_size bytes starting at string into the state's hash context. */
void sha256_update(SHA256_STATE* state, char* string, int32 string_size)
{
    SHA256_CTX *ctx = state->sha256;

    SHA256_Update(ctx, string, string_size);
}
/*
 * Finish the hash held in state and write it to outputBuffer as 64 lowercase
 * hexadecimal characters followed by a terminating NUL.
 */
void sha256_final(SHA256_STATE* state, char outputBuffer[65])
{
    static const char hex_digits[] = "0123456789abcdef";
    unsigned char digest[SHA256_DIGEST_LENGTH];
    char *dst = outputBuffer;
    int pos;

    SHA256_Final(digest, state->sha256);

    /* Two hex characters per digest byte, high nibble first */
    for (pos = 0; pos < SHA256_DIGEST_LENGTH; pos++) {
        *dst++ = hex_digits[digest[pos] >> 4];
        *dst++ = hex_digits[digest[pos] & 0x0F];
    }
    *dst = '\0';
}
The aggregate definition:
-- Transition function: takes the running state and the next text value.
CREATE FUNCTION dv_sha256_agg_sfunc(state internal, input text)
    RETURNS internal
    LANGUAGE C
    VOLATILE
    AS 'MODULE_PATHNAME', 'dv_sha256_agg_sfunc';

-- Final function: turns the state into the hexadecimal hash text.
CREATE FUNCTION dv_sha256_agg_final(state internal)
    RETURNS text
    LANGUAGE C
    VOLATILE
    AS 'MODULE_PATHNAME', 'dv_sha256_agg_final';

-- The aggregate itself, wiring the two functions together.
CREATE AGGREGATE dv_sha256_agg(input text) (
    STYPE = internal,
    SFUNC = dv_sha256_agg_sfunc,
    FINALFUNC = dv_sha256_agg_final
);
Test:
select
dv_sha256_agg(id_::text),
encode(digest(string_agg(id_::text,E'\n'),'sha256'),'hex')
from
generate_series(1,100) id_(id_);
Result
dv_sha256_agg | encode
------------------------------------------------------------------+------------------------------------------------------------------
4187fe63fa78d8b4333e6ffc9122e0273ddf90251ced32e1e5b398639c193c87 | 4187fe63fa78d8b4333e6ffc9122e0273ddf90251ced32e1e5b398639c193c87
(1 row)
Notes:
Separator is hardcoded as \n
Null values are ignored

In PostgreSQL you can create your own aggregate function using CREATE AGGREGATE. That way you can write digest_agg yourself.
The challenge is to come up with a good state transition function (SFUNC) that combines the aggregate of the previous values with the next value.

It is possible to implement such a function yourself. Without touching any C code, I wrote one up in plperlu. It is memory efficient, but horrifically slow. If you want it fast, you will probably need to code it up in C. I suspect that the getstate and putstate functions have had no performance consideration whatsoever, as they weren't intended to be used in loops like this.
CREATE LANGUAGE plperlu;
-- Final function of the streaming aggregate: restores the serialized hash state
-- (if any) and returns the finished SHA-256 digest as bytea.
-- It takes the state only, so nothing further is added to the hash here.
CREATE FUNCTION public.sha256_final(state text) RETURNS bytea
LANGUAGE plperlu
AS $_X$
use Digest::SHA;
my $sha=Digest::SHA->new(256);
# A NULL state means no row was aggregated; the digest is then taken over no data.
if (defined $_[0]) { $sha->putstate($_[0])};
return encode_bytea($sha->digest());
$_X$;
-- Transition function of the streaming aggregate: restores the serialized hash
-- state (if any), adds the next input value and returns the new serialized state.
CREATE FUNCTION public.sha256_sfunc(state text, input text) RETURNS text
LANGUAGE plperlu
AS $_X$
use Digest::SHA;
my $sha=Digest::SHA->new(256);
# The state is NULL on the first call.
if (defined $_[0]) { $sha->putstate($_[0])};
# A NULL input arrives as undef; skip it explicitly instead of handing undef to add().
if (defined $_[1]) { $sha->add($_[1])};
return $sha->getstate();
$_X$;
-- Streaming SHA-256 aggregate; the state is the serialized Digest::SHA state as text.
CREATE AGGREGATE public.perl_sha256(input text) (
    STYPE = text,
    SFUNC = public.sha256_sfunc,
    FINALFUNC = public.sha256_final
);
--The built in sha256 on pre-aggregate gives the same answer...
select sha256(string_agg::bytea) from (select string_agg(id_::text,'') from generate_series(1,101) id_(id_)) foobar;
--- ...as my own streaming aggregate.
select perl_sha256(id_::text) from generate_series(1,101) id_(id_);
Of course you have to make sure rows are delivered in the correct order. And I didn't implement the ':' delimiter from string_agg, just ''.

Related

How to return a jsonb object from a PostgreSQL c extension function?

How can I return a simple jsonb object in a PostgreSQL function written in C?
I don't know enough about postgres server side programming. And below is my attempt to return a simple json/jsonb object based on the C source code for hstore_to_jsonb_loose, which is the closest example I can find. I am trying to return {"hi": -101} from the C function, but got an error:
=> ERROR: unexpected jsonb type as object key
Can anyone help explain how to get this right?
My C code is:
PG_FUNCTION_INFO_V1(test_return_jsonb);
Datum
test_return_jsonb( PG_FUNCTION_ARGS) {
JsonbParseState *state = NULL;
JsonbValue *res;
StringInfoData tmp;
initStringInfo(&tmp);
(void) pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
JsonbValue key, val;
//key
key.type = jbvString;
key.val.string.len = 2;
key.val.string.val = "hi";
Datum numd;
//value
val.type = jbvNumeric;
numd = DirectFunctionCall3(numeric_in, CStringGetDatum("-101"), //!tmp.data),
ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1));
val.val.numeric = DatumGetNumeric(numd);
(void) pushJsonbValue(&state, WJB_VALUE, &val);
res = pushJsonbValue(&state, WJB_END_OBJECT, NULL);
PG_RETURN_POINTER(JsonbValueToJsonb(res));
}
And the SQL interface code is:
-- SQL binding for the C function test_return_jsonb in the pgtest library.
CREATE OR REPLACE FUNCTION test_return_jsonb()
    RETURNS jsonb
    LANGUAGE c
    IMMUTABLE
    STRICT
    COST 100 -- Guessed cost
    AS '$libdir/pgtest', 'test_return_jsonb';
This is with PostgreSQL 12 and Ubuntu 18.04 LTS.
I'm learning too currently and encountered the same issue. I solved it like following (not sure if this is the right way, but it works for now):
// Defined in "/src/backend/utils/adt/numeric.c"
extern Datum int8_numeric(PG_FUNCTION_ARGS);
extern Datum float8_numeric(PG_FUNCTION_ARGS);
extern Datum numeric_int8(PG_FUNCTION_ARGS);
extern Datum numeric_float8(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(test_return_jsonb);
/*
 * test_return_jsonb
 *
 * Builds the jsonb object {"foo": 100} by filling a JsonbValue of type
 * jbvObject by hand and converting it with JsonbValueToJsonb().
 */
Datum test_return_jsonb(PG_FUNCTION_ARGS) {
    JsonbPair *pair = palloc(sizeof(JsonbPair));
    JsonbValue *object = palloc(sizeof(JsonbValue));

    pair->key.type = jbvString;
    pair->key.val.string.len = 3;
    pair->key.val.string.val = "foo";

    /*
     * DirectFunctionCall1 takes Datum arguments; the integer has to be wrapped
     * with Int64GetDatum rather than passed as a raw int64.
     */
    pair->value.type = jbvNumeric;
    pair->value.val.numeric = DatumGetNumeric(DirectFunctionCall1(int8_numeric, Int64GetDatum(100)));

    object->type = jbvObject;
    object->val.object.nPairs = 1;
    object->val.object.pairs = pair;

    PG_RETURN_POINTER(JsonbValueToJsonb(object));
}

Pass String array as input into external C function

I would like to pass a String vector into an external C function.
In a minimal example I just want to pass the String vectors (or 1D array) through the C function.
My Modelica function looks like:
// Passes a String vector of length 2 to the external C function test_stri and
// returns the String vector it fills in.
function testreadstri
input String instri[2];
output String outstri[2];
// Arguments in C order: input strings, number of input strings, output strings.
external "C" test_stri(instri, size(instri, 1), outstri);
annotation (Include="#include <ebcmysql.cpp>", Library="libmysql");
end testreadstri;
My C function looks like:
/*
 * test_stri
 *
 * Skeleton that visits each of the nLines lines once; the statement that
 * transfers input to output goes where the <LINE_OF_INTEREST> placeholder is.
 *
 * NOTE(review): thestring is declared as a single string although the loop
 * body is meant to index it per line - an array of strings presumably needs
 * `const char **`; confirm against the calling side.
 */
void test_stri(const char* thestring, size_t nLines, const char **testresult)
{
    size_t iLines;

    /* Valid indices are 0 .. nLines - 1; index nLines is one past the end. */
    for (iLines = 0; iLines < nLines; iLines++) {
<LINE_OF_INTEREST>
    }
}
I tried for <LINE_OF_INTEREST> the following lines:
testresult[iLines] = thestring[iLines];
strcpy(testresult[iLines], thestring[iLines]);
What works, but of course does not pass the input through as an output, is:
testresult[iLines] = "aTestString";
Is there any possibility to handle Modelica input String vectors in the external C function?
Thanks in advance!
Here's a short, self-contained and compilable example demonstrating both input string and output string handling of a pure external function in Modelica
// Demonstration model: declares one external C function and calls it once.
model Model
// Takes two input strings and returns two output strings produced by the C code below.
function testreadstri
input String instri[2];
output String outstri[2];
// Both arrays are passed together with their lengths.
external "C" test_stri(instri, size(instri, 1), outstri, size(outstri, 1));
// The C implementation is supplied inline through the Include annotation:
// it prints every input string, then sets every output element to a newly
// allocated copy of "result".
// NOTE(review): when the allocation returns NULL the output element is left unassigned.
annotation(Include="
#include \"ModelicaUtilities.h\"
#include <stdlib.h>
#include <string.h>
void test_stri(const char** thestring, size_t nLinesIn, const char** testresult, size_t nLinesOut)
{
size_t iLines;
// example for input string handling
for (iLines = 0; iLines < nLinesIn; iLines++) {
ModelicaFormatMessage(\"%s\\n\", thestring[iLines]);
}
// example for output string handling
for (iLines = 0; iLines < nLinesOut; iLines++) {
char* line = ModelicaAllocateStringWithErrorReturn(6);
if (line != NULL) {
strcpy(line, \"result\");
testresult[iLines] = line;
}
}
}");
end testreadstri;
// Example call with two literal input strings.
String s[:] = testreadstri({"first", "second"});
end Model;
Yes, this is supported by the Modelica specification, see https://specification.modelica.org/v3.4/Ch12.html#argument-type-mapping.

PostgreSQL: Fetch a sequence nextval from a C function

I wrote the following to fetch the next value in a sequence. It works perfectly:
static int64 _get_md_key_next_serial()
{
int ret = SPI_execute("SELECT nextval('md_key_seq')", true, 1);
if (ret <= 0)
return (int64)ret;
if (SPI_processed)
{
SPITupleTable *tuptable = SPI_tuptable;
bool fieldNull;
Datum datum = SPI_getbinval(tuptable->vals[0], tuptable->tupdesc, 1, &fieldNull);
if (!fieldNull)
return DatumGetInt64(datum);
}
return NULL_ZERO;
}
However, surely there is a function call I can make without having to go through SPI?
Unfortunately nextval_internal is not exported, but you could try to call nextval_oid. I did not test this code, so you'll probably have to debug it:
#include "fmgr.h"
#include "commands/sequence.h"
/*
 * Call nextval_oid() directly for the sequence with the given Oid.
 *
 * A local call-info structure is set up with one non-NULL argument (the
 * sequence Oid), reusing the flinfo of the caller's fcinfo.
 *
 * NOTE(review): commands/sequence.h may itself declare a function named
 * nextval - confirm that this static helper does not clash with it.
 */
static int64 nextval(Oid sequenceID, FunctionCallInfo fcinfo)
{
    FunctionCallInfoData call;
    Datum result;

    InitFunctionCallInfoData(call, fcinfo->flinfo, 1,
                             InvalidOid, NULL, NULL);
    call.argnull[0] = false;
    call.arg[0] = ObjectIdGetDatum(sequenceID);

    result = nextval_oid(&call);
    return DatumGetInt64(result);
}
Pass the Oid of the sequence and the fcinfo from your own SQL function.

function does not exist in pg_proc in postgresql

I tried to call my user-defined function in PostgreSQL from C# code.
My function creation script is as follows:
-- Looks up the pops row matching the given crop and hybrid ids and returns
-- element (20 + currday) of its data column as a varchar.
CREATE OR REPLACE FUNCTION public."GetUserDailyData"(
    cid integer,
    hid integer,
    currday integer)
RETURNS character varying
LANGUAGE plpgsql
COST 100
AS $BODY$
DECLARE
    returndata varchar := '';
BEGIN
    -- '+' binds tighter than '->', so the index is 20 + currday.
    SELECT data -> (20 + currday)
      INTO returndata
      FROM pops
     WHERE hybid = hid AND cropid = cid;
    RETURN returndata;
END
$BODY$;
My method to call this function is as follows,
/// <summary>
/// Opens a connection, calls the stored procedure <paramref name="spName"/> inside a
/// transaction and returns the first column of the first row of its result.
/// </summary>
/// <param name="conString">Connection string for the database.</param>
/// <param name="spName">Name of the stored procedure, used verbatim as the command text.</param>
/// <param name="param">Parameters to pass; may be null or empty.</param>
/// <returns>The scalar result of the call.</returns>
public static object ExecuteScalar(string conString, string spName, NpgsqlParameter[] param)
{
    using (var conn = new NpgsqlConnection(conString))
    {
        conn.Open();
        using (var tran = conn.BeginTransaction())
        using (var command = conn.CreateCommand())
        {
            command.CommandText = spName;
            command.CommandType = CommandType.StoredProcedure;
            if (param != null)
            {
                foreach (var p in param)
                {
                    command.Parameters.Add(p);
                }
            }
            var result = command.ExecuteScalar();
            // Without an explicit commit the transaction is rolled back when it is disposed.
            tran.Commit();
            return result;
        }
    }
}
I tried everything even checked the existence of this function in pg_proc using
select * from pg_proc where proname = 'GetUserDailyData'
and it reflected the function details row.
But every time it is giving the same error.
Any kind of suggestion would be highly appreciated. Thanks.
Adding objects with case-sensitive names in PostgreSQL can lead to these complications. In this case you need to put the name of the stored procedure in double quotes. It would be advisable, however, not to create objects that rely on case sensitivity at all: use underscores instead, or create and refer to objects in CamelCase without the quotes (which creates/refers to the objects in lower case). In any case, you may also need to specify the whole signature (not just the name) as the CommandText, and specify the data types of the parameters (see this).
...
command.CommandText = "\"" + spName + "\"";
...

Is it possible to implement the Haversine formula in Objective-C and call it from SQLite?

As I understand, SQLite doesn't have the math functions to properly implement the Haversine formula in straight SQL. I'm thinking this should be possible using an external function, with the implementation being in C.
The goal is to have a SQLite database on an iPhone, and to be able to sort by the distance to the user's current location. I've searched, but I can't find any examples of this being done. I think the difficult part would be getting the function declarations correct. The end result I'm hoping for is to be able to execute a SQL statement like:
SELECT * FROM LOCATION loc ORDER BY distance(loc.lat, loc.long, ?, ?)
I have a C Haversine formula. The function definition is as follows:
float distance( float nLat1, float nLon1, float nLat2, float nLon2 );
Does anyone know if this is possible and/or have some example code to start from?
I just had good luck with this post: http://www.thismuchiknow.co.uk/?p=71
This demonstrates a sqlite function that takes in one string parameter and returns a string result.
In your case you would need a function that reads four floats and returns a float but the principle is the same (you would replace sqlite3_value_text with sqlite3_value_double and sqlite3_result_text with sqlite3_result_double):
#include <stdlib.h>
#include <sqlite3.h>
#include <stdio.h>
/*
 * SQLite user function "haver": logs the text form of its first argument and
 * returns a fixed string.
 *
 * ctx - function context that receives the result
 * cnt - number of arguments in val
 * val - argument values
 */
void haver(sqlite3_context* ctx,int cnt,sqlite3_value** val)
{
    /* this would call the distance function */
    static const char resultOfCall[] = "Result of function call";
    /* sqlite3_value_text() yields NULL for an SQL NULL argument */
    const unsigned char *arg = (cnt > 0) ? sqlite3_value_text(val[0]) : NULL;

    printf("In SQLite haver implementation, called for value: %s\n",
           arg != NULL ? (const char *) arg : "NULL");

    /*
     * A negative length makes SQLite read up to the terminating NUL, so no
     * strlen() is needed; SQLITE_STATIC is valid because the string has
     * static storage.
     */
    sqlite3_result_text(ctx, resultOfCall, -1, SQLITE_STATIC);
}
/*
 * Row callback for sqlite3_exec(): prints every column name with its value.
 *
 * udata   - user pointer passed through sqlite3_exec(); not used
 * ncol    - number of columns in the row
 * value   - column values as strings; an entry is NULL for an SQL NULL
 * colname - column names
 *
 * Always returns 0.
 */
int cback (void* udata,int ncol,char** value,char** colname)
{
    int col;

    (void) udata;
    for (col = 0; col < ncol; col++) {
        /* Passing a NULL pointer to %s is undefined, so substitute a marker */
        const char *shown = (value[col] != NULL) ? value[col] : "NULL";

        printf("Result column: %s value: %s \n", colname[col], shown);
    }
    return 0;
}
int main()
{
sqlite3 * handle;
int res = sqlite3_open("./test.sql", &handle);
res = sqlite3_create_function(handle, "haver", 1, SQLITE_UTF8, NULL, &haver, NULL, NULL);
char * errmsg = NULL;
res = sqlite3_exec(handle, "select haver(w) from t", &cback, NULL, &errmsg);
printf("sqlite3_exec result: %d %s\n", res, errmsg != NULL ? errmsg : "No error");
sqlite3_close(handle);
}