Related
I am trying to port my SHA-256 hash function from CPU code to CUDA. After googling, I found a few working examples of SHA-256 for CUDA. However, when tested, the hash produced by the CUDA code is different from the one produced by OpenSSL.
My input is "hello world" which is declared as const char*. result are as below;
Constant Char* Input : hello world
Hash on CPU : b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9
Hash on GPU : c1114db6b517b4db8d360a9e14f5c2a57de95d955ec20cbd4cb73facb2b13e5f
I need help to fix my GPU code for sha256 so that it will produce same hash as given by CPU (OpenSSL).
Here my code for CPU Hash
#pragma warning(disable : 4996) //disable compiler error
#include <iostream>
#include <openssl/sha.h>
unsigned char hash[SHA256_DIGEST_LENGTH];
void SHA256(const char* input, size_t input_size){
SHA256_CTX sha256;
SHA256_Init(&sha256);
SHA256_Update(&sha256, input, input_size);
SHA256_Final(hash, &sha256);
}
// Hashes the fixed message "hello world" on the host and prints the digest
// as 64 lowercase hex characters followed by a newline.
void CPU() {
    const char* input = "hello world";
    SHA256(input, strlen(input));

    // Walk the 32 digest bytes that SHA256() stored in `hash`.
    for (const unsigned char* byte = hash; byte != hash + 32; ++byte) {
        printf("%02x", *byte);
    }
    printf("\n");
}
and Here my code for GPU hash
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define BLOCK_SIZE 256
// Per-round constants: sha256_kernel adds k[t] into t1 on round t (t = 0..63).
// The 64 values match the SHA-256 round constants K listed in FIPS 180-4.
// Declared __constant__, so the table lives in device constant memory.
__constant__ unsigned int k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
// "Choose": each result bit is taken from y where the matching bit of x is 1,
// and from z where it is 0.
__device__ unsigned int Ch(unsigned int x, unsigned int y, unsigned int z) {
    // z ^ (x & (y ^ z)) selects y under x and z elsewhere, bit for bit.
    const unsigned int differing = y ^ z;
    return z ^ (x & differing);
}
// "Majority": each result bit is 1 when at least two of the matching bits of
// x, y and z are 1.
__device__ unsigned int Maj(unsigned int x, unsigned int y, unsigned int z) {
    const unsigned int both_xy = x & y;           // x and y already agree on 1
    const unsigned int z_with_one = z & (x | y);  // z plus one of the others
    return both_xy | z_with_one;
}
// SHA-256 "big sigma 0" (FIPS 180-4, section 4.1.2):
//   Sigma0(x) = ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)
// All three rotations are required; a single rotation yields a wrong digest.
__device__ unsigned int Sigma0(unsigned int x) {
    const unsigned int rot2  = (x >> 2u)  | (x << 30u);
    const unsigned int rot13 = (x >> 13u) | (x << 19u);
    const unsigned int rot22 = (x >> 22u) | (x << 10u);
    return rot2 ^ rot13 ^ rot22;
}
// SHA-256 "big sigma 1" (FIPS 180-4, section 4.1.2):
//   Sigma1(x) = ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)
// All three rotations are required; a single rotation yields a wrong digest.
__device__ unsigned int Sigma1(unsigned int x) {
    const unsigned int rot6  = (x >> 6u)  | (x << 26u);
    const unsigned int rot11 = (x >> 11u) | (x << 21u);
    const unsigned int rot25 = (x >> 25u) | (x << 7u);
    return rot6 ^ rot11 ^ rot25;
}
// SHA-256 "small sigma 0" used by the message schedule (FIPS 180-4, 4.1.2):
//   sigma0(x) = ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)
// The last term is a plain right shift, not a rotation.
__device__ unsigned int sigma0(unsigned int x) {
    const unsigned int rot7  = (x >> 7u)  | (x << 25u);
    const unsigned int rot18 = (x >> 18u) | (x << 14u);
    const unsigned int shr3  = x >> 3u;
    return rot7 ^ rot18 ^ shr3;
}
// SHA-256 "small sigma 1" used by the message schedule (FIPS 180-4, 4.1.2):
//   sigma1(x) = ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)
// The last term is a plain right shift, not a rotation.
__device__ unsigned int sigma1(unsigned int x) {
    const unsigned int rot17 = (x >> 17u) | (x << 15u);
    const unsigned int rot19 = (x >> 19u) | (x << 13u);
    const unsigned int shr10 = x >> 10u;
    return rot17 ^ rot19 ^ shr10;
}
// Computes the SHA-256 digest of one message.
//
// Parameters:
//   input      - device pointer to the message bytes (only read)
//   input_size - message length in bytes; 0 is allowed
//   output     - device pointer that receives the 32-byte digest, most
//                significant byte of each state word first
//
// Launch layout: any 1-D grid/block shape is accepted. Hashing one message is
// a sequential chain of block compressions, so only global thread 0 does the
// work and writes `output`; every other thread returns immediately. That also
// keeps all writes inside the 32-byte output buffer. No shared memory is used.
__global__ void sha256_kernel(const char* input, size_t input_size, unsigned char* output) {
    const size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (tid != 0) {
        return;
    }

    // Read the message as unsigned bytes: plain `char` may be signed, and a
    // sign-extended byte >= 0x80 would corrupt the upper bits of a word.
    const unsigned char* msg = reinterpret_cast<const unsigned char*>(input);

    unsigned int h[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };

    // Padding rule: message, one 0x80 byte, zero bytes, then the message
    // length in bits as a 64-bit big-endian integer, so that the total is a
    // multiple of 64 bytes.
    const unsigned long long bit_len = (unsigned long long)input_size * 8ull;
    const size_t padded_size = ((input_size + 1 + 8 + 63) / 64) * 64;

    unsigned int w[64];
    for (size_t base = 0; base < padded_size; base += 64) {
        // Build the first 16 schedule words from the padded stream without
        // ever reading past input_size.
        for (size_t t = 0; t < 16; t++) {
            unsigned int word = 0;
            for (size_t b = 0; b < 4; b++) {
                const size_t pos = base + t * 4 + b;
                unsigned int byte;
                if (pos < input_size) {
                    byte = msg[pos];
                } else if (pos == input_size) {
                    byte = 0x80u;  // mandatory "1" bit right after the message
                } else if (pos >= padded_size - 8) {
                    // One byte of the 64-bit big-endian bit length.
                    byte = (unsigned int)(bit_len >> (8 * (padded_size - 1 - pos))) & 0xffu;
                } else {
                    byte = 0u;
                }
                word = (word << 8u) | byte;
            }
            w[t] = word;
        }
        for (size_t t = 16; t < 64; t++) {
            w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
        }

        unsigned int a = h[0];
        unsigned int b = h[1];
        unsigned int c = h[2];
        unsigned int d = h[3];
        unsigned int e = h[4];
        unsigned int f = h[5];
        unsigned int g = h[6];
        unsigned int hh = h[7];
        for (size_t t = 0; t < 64; t++) {
            const unsigned int t1 = hh + Sigma1(e) + Ch(e, f, g) + k[t] + w[t];
            const unsigned int t2 = Sigma0(a) + Maj(a, b, c);
            hh = g;
            g = f;
            f = e;
            e = d + t1;
            d = c;
            c = b;
            b = a;
            a = t1 + t2;
        }
        h[0] += a;
        h[1] += b;
        h[2] += c;
        h[3] += d;
        h[4] += e;
        h[5] += f;
        h[6] += g;
        h[7] += hh;
    }

    // Serialize the eight state words big-endian into output[0..31].
    for (size_t t = 0; t < 8; t++) {
        output[t * 4 + 0] = (unsigned char)(h[t] >> 24u);
        output[t * 4 + 1] = (unsigned char)(h[t] >> 16u);
        output[t * 4 + 2] = (unsigned char)(h[t] >> 8u);
        output[t * 4 + 3] = (unsigned char)(h[t] >> 0u);
    }
}
// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Hashes the fixed message "hello world" with sha256_kernel and prints the
// 32-byte digest as lowercase hex followed by a newline. On any CUDA failure a
// diagnostic is written to stderr and no digest is printed. Device buffers are
// released on every path.
void GPU() {
    const char* input = "hello world";
    const size_t input_size = strlen(input);
    const size_t output_size = 32;
    unsigned char output_host[32];
    unsigned char* output = NULL;
    char* input_device = NULL;

    // Allocate at least one byte so an empty message still gets a valid pointer.
    bool ok = cuda_ok(cudaMalloc((void**)&output, output_size), "cudaMalloc(output)")
           && cuda_ok(cudaMalloc((void**)&input_device, input_size ? input_size : 1), "cudaMalloc(input)")
           && cuda_ok(cudaMemcpy(input_device, input, input_size, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    if (ok) {
        // One digest is one sequential computation into one 32-byte buffer, so
        // a single thread is launched. A grid derived from input_size would
        // have zero blocks (an invalid launch) for an empty message.
        sha256_kernel<<<1, 1>>>(input_device, input_size, output);
        // A launch reports configuration errors only via cudaGetLastError();
        // the blocking copy below surfaces errors raised while it ran.
        ok = cuda_ok(cudaGetLastError(), "sha256_kernel launch")
          && cuda_ok(cudaMemcpy(output_host, output, output_size, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
    }

    if (ok) {
        for (size_t i = 0; i < output_size; i++) {
            printf("%02x", output_host[i]);
        }
        printf("\n");
    }

    // cudaFree accepts a null pointer, so this is safe after a failed allocation.
    cudaFree(output);
    cudaFree(input_device);
}
Thanks in advance.
I need to implement my own packets to send over UDP. I decided that I would do this by sending a char buffer which has the sequence number, checksum, size, and the data of the packet, which is bytes from a file. The string I'm sending separates each field by a semicolon. Then, when I receive the string (which is my packet), I want to extract each field, use them accordingly (the sequence number, size, and checksum) and write the bytes to a file. So far I have written a method to create 100 packets, and I'm trying to extract and write the bytes to a file (I'm not doing it in the receiver yet; first I'm testing the parsing in the sender). For some reason, the bytes written to my file are incorrect and I'm getting a "JPEG DATASTREAM CONTAINS NO IMAGE" error when I try to open it.
/* One node of a singly linked list of outgoing packets. */
struct packetNode{
char packet[1052]; // this is the entire packet data including the header
struct packetNode *next; // following node in the list; NULL marks the last node
};
This is how I'm creating my packets:
//populate initial window of size 100
// Each pass reads one chunk of the input file, echoes it to fpNew, formats it
// into a freshly allocated packetNode and appends that node to the head/tail list.
for(i = 0; i < 100; i++){
memset(&data[0], 0, sizeof(data));
struct packetNode *p; // create packet node
// NOTE(review): the malloc result is used without a NULL check.
p = (struct packetNode *)malloc(sizeof(struct packetNode));
bytes = fread(data, 1, sizeof(data), fp); // read 1024 bytes from file into data buffer
int b = fwrite(data, 1, bytes, fpNew);
printf("read: %d\n", bytes);
memset(&p->packet[0], 0, sizeof(p->packet));
// NOTE(review): "%s" treats data as a NUL-terminated string, but data holds raw
// file bytes from fread. The copy stops at the first zero byte, and a chunk with
// no zero byte has no terminator inside data at all.
sprintf(p->packet, "%d;%d;%d;%s", s, 0, numPackets, data); // create packet
//calculate checksum
// NOTE(review): the checksum covers the packet formatted with a 0 placeholder;
// the packet is then re-formatted with the real value, so the stored bytes are
// not the bytes that were summed.
int check = checksum8(p->packet, sizeof(p->packet));
sprintf(p->packet, "%d;%d;%d;%s", s, check, numPackets, data); //put checksum in packet
s++; //incremenet sequence number
// Append p to the list: the first node becomes both head and tail.
if(i == 0){
head = p;
tail = p;
tail->next = NULL;
}
else{
tail->next = p;
tail = p;
tail->next = NULL;
}
}
fclose(fp);
and this is where I parse and write the bytes to a file:
/*
 * Walks the packet list that starts at the global `head`, splits each packet
 * on ';' into sequence number, checksum, size and payload, prints the three
 * numbers and appends a 1024-byte data block per packet to test.jpg.
 *
 * strtok() writes NUL bytes into ptr->packet, so the packets are modified in
 * place. The payload is still handled as text (it ends at the first NUL or
 * ';'), which cannot carry arbitrary binary data; that limit belongs to the
 * packet format itself.
 */
void test(){
    FILE *fpNew = fopen("test.jpg", "wb"); /* binary mode: no newline translation */
    struct packetNode *ptr;
    char data[1024];

    if(fpNew == NULL){
        perror("test.jpg");
        return;
    }
    for(ptr = head; ptr != NULL; ptr = ptr->next){
        char *seqTok = strtok(ptr->packet, ";");
        char *sumTok = strtok(NULL, ";");
        char *sizeTok = strtok(NULL, ";");
        char *payloadTok = strtok(NULL, ";");
        int s, c, size;

        /* atoi(NULL) is undefined, so reject packets with a short header. */
        if(seqTok == NULL || sumTok == NULL || sizeTok == NULL){
            fprintf(stderr, "skipping malformed packet\n");
            continue;
        }
        s = atoi(seqTok);
        c = atoi(sumTok);
        size = atoi(sizeTok);

        memset(data, 0, sizeof(data));
        /* The token can be longer than data (packets are 1052 bytes), so the
           copy is bounded. No terminator is needed: fwrite below writes the
           whole zero-padded buffer. */
        if(payloadTok != NULL)
            strncpy(data, payloadTok, sizeof(data));

        printf("sequence: %d, checksum: %d, size: %d\n", s,c,size);
        if(fwrite(data, 1, sizeof(data), fpNew) != sizeof(data)){
            perror("test.jpg");
            break;
        }
    }
    fclose(fpNew);
}
Since there is transfer of binary data, a JPEG stream, this data cannot be treated as a string. It's better to go all binary. For instance, instead of
sprintf(p->packet, "%d;%d;%d;%s", s, 0, numPackets, data); // create packet
you would do
// Write only the text header, then append the payload as raw bytes.
sprintf(p->packet, "%d;%d;%d;", s, 0, numPackets);
// strlen() gives the header length, so the payload starts right after the last ';'.
// memcpy copies exactly `bytes` bytes and does not stop at zero bytes.
memcpy(&p->packet[strlen(p->packet)], data, bytes);
but this leads to parsing problems: we would need to change this:
tokens = strtok(NULL, ";");
if(tokens != NULL)
strcpy(data, tokens);
to something like this:
// Advance past the size token: its decimal digits plus the one delimiter byte.
// The digit count is derived from `size`; values of 10000 and above are all
// treated as 5 digits, and negative values are not accounted for.
tokens += 1 + ( size < 10 ? 1 : size < 100 ? 2 : size < 1000 ? 3 : size < 10000 ? 4 : 5 );
// Copies sizeof(data) bytes regardless of how many payload bytes the packet holds.
memcpy(data, tokens, sizeof(data));
#Binary Protocol
It's easier to use a binary packet:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Wire format of one packet: a fixed header followed directly by the payload.
 *
 * The pack pragma is spelled pack(push, 1) / pack(pop). Compilers treat
 * "push(pack,1)" as an unknown pragma and ignore it, which leaves padding after
 * cksum. With 1-byte packing the header has no padding and payload starts
 * immediately after cksum.
 */
#pragma pack(push, 1)
typedef struct Packet {
    int seq, maxseq, size;      /* sequence number, number of packets, total packet size in bytes */
    unsigned short cksum;       /* checksum over the packet, computed with this field set to 0 */
    unsigned char payload[];    /* flexible array member: the data bytes follow the header */
} Packet;
#pragma pack(pop)
/* Singly linked list node that owns one Packet. */
typedef struct PacketNode{
struct PacketNode * next; /* following node; NULL on the last node */
/* Kept as the last member: Packet ends in a flexible array, so the payload
   bytes live in the extra space allocated after the node. */
Packet packet;
} PacketNode;
/*
 * Allocates a zero-filled PacketNode with room for maxPayloadSize payload bytes
 * after the packet header.
 *
 * Returns the new node, or NULL when maxPayloadSize is negative or the
 * allocation fails. The caller releases the node with free().
 */
PacketNode * allocPacketNode(int maxPayloadSize) {
    if (maxPayloadSize < 0)
        return NULL;
    /* calloc zero-fills, and a failed allocation is returned as NULL instead
       of being written to. */
    return (PacketNode *) calloc(1, sizeof(PacketNode) + (size_t) maxPayloadSize);
}
/* Defined later in the file; declared here so the call below has a prototype. */
int checksum(unsigned char * data, int len);

/*
 * Reads fp in chunks of chunksize bytes and builds a linked list with one
 * packet per chunk.
 *
 *   fp        - open input stream, positioned where reading should start
 *   fsize     - total number of bytes expected, used only to compute maxseq
 *   chunksize - maximum payload bytes per packet; must be positive
 *
 * Returns the head of the list, or NULL when nothing could be read (empty
 * file, bad arguments, or allocation failure on the first node). Every node in
 * the list carries at least one payload byte, so packet.size is never smaller
 * than sizeof(Packet). The caller frees the nodes.
 */
PacketNode * prepare(FILE * fp, int fsize, int chunksize)
{
    PacketNode * head = NULL;
    PacketNode ** link = &head;   /* where the next node gets attached */
    int maxseq, seq = 0;

    if ( fp == NULL || chunksize <= 0 )
        return NULL;

    maxseq = fsize / chunksize + ( fsize % chunksize ? 1 : 0 );

    for (;;)
    {
        PacketNode * pn = allocPacketNode(chunksize);
        size_t rd;

        if ( pn == NULL )
            break;

        rd = fread(pn->packet.payload, 1, (size_t) chunksize, fp);
        if ( rd == 0 )
        {
            /* End of data: drop the unused node rather than linking an empty packet. */
            free(pn);
            break;
        }

        printf("read %d bytes\n", (int) rd);
        pn->packet.seq = seq++;
        pn->packet.maxseq = maxseq;
        pn->packet.size = (int) ( rd + sizeof(Packet) );
        /* The checksum covers the whole packet with the cksum field still 0. */
        pn->packet.cksum = 0;
        pn->packet.cksum = (unsigned short) ~checksum((unsigned char *) &pn->packet, pn->packet.size);

        /* A short read gets its own node as well, so no chunk is overwritten. */
        *link = pn;
        link = &pn->next;
    }
    return head;
}
/*
 * Sums the len bytes at data and folds the sum into 16 bits by adding the
 * bits above bit 15 back into the low 16 bits until nothing is left above.
 *
 * Returns a value in 0..0xffff. A non-positive len yields 0.
 */
int checksum(unsigned char * data, int len)
{
    /* Unsigned and wide, so that long buffers cannot overflow the accumulator. */
    unsigned long long sum = 0;
    int i;

    for ( i = 0; i < len; i ++ )
        sum += data[i];

    /* One fold is not always enough: 0x1ffff folds to 0x10000, which still
       needs another pass to fit in 16 bits. */
    while ( sum > 0xffffULL )
        sum = (sum & 0xffffULL) + (sum >> 16);

    return (int) sum;
}
/*
 * Prints the header of every packet in the list starting at ptr and writes the
 * payloads, in list order, to test.jpg.
 *
 * Packets whose size field is smaller than the header are reported on stderr
 * and skipped, because subtracting sizeof(Packet) from such a size would wrap
 * around to a huge unsigned length.
 */
void test( PacketNode * ptr ) {
    FILE *fpNew = fopen("test.jpg", "wb");   /* binary mode: no newline translation */

    if ( fpNew == NULL )
    {
        perror("test.jpg");
        return;
    }
    for ( ; ptr != NULL; ptr = ptr->next )
    {
        size_t payload;

        if ( ptr->packet.size < 0 || (size_t) ptr->packet.size < sizeof(Packet) )
        {
            fprintf(stderr, "skipping packet %d: invalid size %d\n",
                ptr->packet.seq, ptr->packet.size);
            continue;
        }
        payload = (size_t) ptr->packet.size - sizeof(Packet);

        /* The payload length is a size_t; print it through a matching format. */
        printf("sequence: %d/%d, checksum: %04x, size: %lu\n",
            ptr->packet.seq,
            ptr->packet.maxseq,
            (unsigned) ptr->packet.cksum,
            (unsigned long) payload
        );
        if ( payload > 0 && fwrite(ptr->packet.payload, payload, 1, fpNew) != 1 )
        {
            perror("test.jpg");
            break;
        }
    }
    fclose(fpNew);
}
/*
 * Reports an unrecoverable error and terminates the process with exit status 1.
 * The message goes to stderr so it stays visible when stdout is redirected and
 * does not mix with the program's normal output.
 */
void fatal( const char * msg )
{
    fprintf(stderr, "FATAL: %s\n", msg);
    exit(1);
}
int main(int argc, char** argv)
{
if ( ! argv[1] ) fatal( "missing filename argument" );
FILE * fp = fopen( argv[1], "r" );
if ( ! fp ) fatal( "cannot open file" );
fseek( fp, 0, SEEK_END );
long fsize = ftell(fp);
fseek( fp, 0, SEEK_SET );
printf("Filesize: %d\n", fsize );
test( prepare(fp, fsize, 1024) );
}
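The packing pragma changes how the compiler aligns the fields of the struct. We want them to be compact, for network transport. Note that the correct spelling is #pragma pack(push, 1) to start packing and #pragma pack(pop) to end it; the forms #pragma push(pack,1) and #pragma pop(pack) are not recognized, so the compiler ignores them and the struct stays padded. Using 1 means byte-aligned. The #pragma pack(pop) restores the previous setting of the pack pragma.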
A note on the checksum method
First we calculate the sum of all the bytes in the packet:
int sum = 0, i;
for ( i = 0; i < len; i ++ )
sum += data[i];
Since the packet uses an unsigned short (16 bits, max value 65535 or 0xffff) to store the checksum, we make sure that the result will fit:
if ( sum > 0xffff ) // takes up more than 16 bits.
Getting the low 16 bits of this int is done using sum & 0xffff, masking out everything but the low 16 bits. We could simply return this value, but we would lose the information from the higher checksum bits. So, we will add the upper 16 bits to the lower 16 bits. Accessing the higher 16 bits is done by shifting the int to the right by 16 bits, like so: sum >> 16. This is the same as sum / 65536, since 65536 = 2^16 = 1 << 16. Note that this addition can itself produce a carry into bit 16 (for example 0x1ffff becomes 0xffff + 1 = 0x10000), so the fold has to be repeated until the result fits in 16 bits.
sum = (sum & 0xffff) + (sum>>16); // add low 16 bits and high 16 bits
I should note that network packet checksums are usually computed 2 bytes (or 'octets' as they like to call them there) at a time. For that, the data should be cast to an unsigned short *, and len should be divided by 2. However! len may be odd, so in that case we'll need to take special care of the last byte. For instance, assuming that the maximum packet size is even, and that the len argument is always <= max_packet_size:
unsigned short * in = (unsigned short *) data;
if ( len & 1 ) data[len] = 0; // make sure last byte is 0
len = (len + 1) / 2;
The rest of the checksum method can remain the same, except that it should operate on in instead of data.
I have a FixMessage and I want to calculate the checksum manually.
8=FIX.4.2|9=49|35=5|34=1|49=ARCA|52=20150916-04:14:05.306|56=TW|10=157|
The body length here is calculated:
8=FIX.4.2|9=49|35=5|34=1|49=ARCA|52=20150916-04:14:05.306|56=TW|10=157|
0 + 0 + 5 + 5 + 8 + 26 + 5 + 0 = 49(correct)
The checksum is 157 (10=157). How to calculate it in this case?
You need to sum every byte in the message up to but not including the checksum field. Then take this number modulo 256, and print it as a number of 3 characters with leading zeroes (e.g. checksum=13 would become 013).
Link from the FIX wiki: FIX checksum
An example implementation in C, taken from onixs.biz:
/*
 * Adds up the first bufLen bytes of buf and formats the sum modulo 256 as a
 * three-character, zero-padded decimal string.
 *
 * The result lives in a static buffer: it is overwritten by the next call and
 * must not be freed by the caller.
 */
char *GenerateCheckSum( char *buf, long bufLen )
{
    static char tmpBuf[ 4 ];
    unsigned int total = 0;
    long pos = 0L;

    while( pos < bufLen )
    {
        total += (unsigned int)buf[ pos ];
        ++pos;
    }

    sprintf( tmpBuf, "%03d", (unsigned int)( total % 256 ) );
    return tmpBuf;
}
Ready-to-run C example adapted from here
8=FIX.4.2|9=49|35=5|34=1|49=ARCA|52=20150916-04:14:05.306|56=TW|10=157|
#include <stdio.h>
/*
 * Prints every byte of buf[0..bufLen) with its numeric value, then prints the
 * FIX checksum: the byte sum modulo 256 as three zero-padded decimal digits.
 */
void GenerateCheckSum( char *buf, long bufLen )
{
    unsigned sum = 0;
    long i;
    for( i = 0L; i < bufLen; i++ )
    {
        /* Go through unsigned char so a byte >= 0x80 is shown as 128..255
           even where plain char is signed. */
        unsigned val = (unsigned char)buf[i];
        sum += val;
        /* "%2c": the '0' flag is not defined for %c, so pad with a width only. */
        printf("Char: %2c Val: %3u\n", buf[i], val); // print value of each byte
    }
    printf("CheckSum = %03u\n", sum % 256 ); // print result
}
/* Runs GenerateCheckSum on a sample FIX message, with SOH written as \001. */
int main()
{
    char msg[] = "8=FIX.4.2\0019=49\00135=5\00134=1\00149=ARCA\00152=20150916-04:14:05.306\00156=TW\001";
    /* sizeof counts the terminating NUL of the literal; it is not part of the
       message, so leave it out of the length. */
    long len = (long)( sizeof(msg) - 1 );
    GenerateCheckSum(msg, len);
    return 0;
}
Points to Note
GenerateCheckSum takes the entire FIX message except CheckSum field
Delimiter SOH is written as \001 which has ASCII value 1
// Computes and prints the FIX CheckSum (tag 10) for a sample message.
static void Main(string[] args)
{
    // Expected trailer for this message: 10=157
    string s = "8=FIX.4.2|9=49|35=5|34=1|49=ARCA|52=20150916-04:14:05.306|56=TW|";

    // '|' is only the printable stand-in for the field delimiter. On the wire
    // the delimiter is SOH (0x01), and the checksum is defined over those
    // bytes; summing '|' instead gives 250 rather than 157.
    byte[] bs = GetBytes(s.Replace('|', '\u0001'));

    int sum = 0;
    foreach (byte b in bs)
        sum += b;

    int checksum = sum % 256;
    // The checksum is always transmitted as three digits with leading zeroes.
    System.Console.WriteLine("10=" + checksum.ToString("D3"));
}
// Returns the in-memory bytes of the string's chars: sizeof(char) bytes per
// character, copied as-is with no text encoding applied.
static byte[] GetBytes(string str)
{
    char[] chars = str.ToCharArray();
    int byteCount = chars.Length * sizeof(char);
    byte[] raw = new byte[byteCount];
    System.Buffer.BlockCopy(chars, 0, raw, 0, byteCount);
    return raw;
}
Using BodyLength[9] and CheckSum[10] fields.
BodyLength is calculated starting from the field immediately after BodyLength, up to but not including the CheckSum field.
CheckSum is calculated from "8=" up to and including the SOH that precedes the CheckSum field.
The binary value of each character is summed, and the least significant byte of that sum is compared to the checksum value.
If the checksum has been calculated to be 274, then the modulo 256 value is 18 (256 + 18 = 274). This value would be transmitted as 10=018, where "10=" is the tag for the checksum field.
In Java there is a method from QuickFixJ.
// Compute the checksum of a raw FIX message string with QuickFIX/J and print it.
String fixStringMessage = "8=FIX.4.29=12535=81=6090706=011=014=017=020=322=837=038=4.39=054=155=ALFAA99=20220829150=0151=06020=06021=06022=F9014=Y";
int checkSum = quickfix.MessageUtils.checksum(fixStringMessage);
System.out.println(checkSum);
Output: 127
Hope it can help you.
The following is extremely slow for what I need.
CFStringCreateWithFormat(NULL, NULL, CFSTR("%d"), i);
Currently this takes 20,000ns in my tests to execute on my 3gs. Perhaps that sounds fast, but I can create and release two NSMutableDictionaries in the time this executes. My C is weak, but there must be something equivalent to itoa that I can use on IOS.
This is the faster I can get:
// Builds a CFString with the decimal text of `integer`, writing the digits by
// hand instead of going through a format string.
// Returns the result of CFStringCreateWithCString.
CFStringRef TECFStringCreateWithInteger(NSInteger integer)
{
    char digits[21]; // long enough for 64 bits integer
    char *cursor = digits + sizeof(digits);
    const int negative = integer < 0;

    // Fill the buffer from the back: terminator first, then the digits from
    // least significant to most significant.
    *--cursor = '\0';
    do {
        // For a negative value the remainder is negative or zero, so negate it
        // to obtain the digit.
        int digit = (int)(integer % 10);
        *--cursor = (char)('0' + (negative ? -digit : digit));
        integer /= 10;
    } while ( integer != 0 );

    if ( negative )
        *--cursor = '-';

    return CFStringCreateWithCString(NULL, cursor, kCFStringEncodingASCII);
}
I have used the following code for converting the bigint in decimal to bytearray (raw data), but I'm getting wrong result.
What is the mistake here?
I'm trying this in Apple Mac ( for Iphone app)
COMP_BYTE_SIZE is 4
Is there any bigendian/ little endian issue, please Help.
/*
 * Writes the bytes of the bigint x into data[0..size-1], filling the buffer
 * from its last byte towards its first. Bytes of each component are taken from
 * the lowest 8 bits upwards, so the result is big-endian provided comps[0] is
 * the least significant component (assumed here, TODO confirm).
 * NOTE(review): the closing brace of this function is not part of this excerpt.
 */
void bi_export(BI_CTX *ctx, bigint *x, uint8_t *data, int size)
{
/* k is the next index to write; it starts at the last byte of data. */
int i, j, k = size-1;
check(x);
memset(data, 0, size); /* ensure all leading 0's are cleared */
for (i = 0; i < x->size; i++)
{
for (j = 0; j < COMP_BYTE_SIZE; j++)
{
/* NOTE(review): 0xff is an int, so with a 32-bit int the shift for j == 3
   moves a 1 into the sign bit before the value is converted to comp. */
comp mask = 0xff << (j*8);
int num = (x->comps[i] & mask) >> (j*8);
data[k--] = num;
/* NOTE(review): this break leaves only the inner loop. If x has further
   components, the outer loop continues and the next store uses k == -1,
   which is before the start of data. */
if (k < 0)
{
break;
}
}
}
Thanks.
The argument size is at least x->size*4, ie. the target array is big enough? Also use
comp mask = (comp)0xff << (j*8);
num should be cast to uint8_t before copy
data[k--] = (uint8_t)num;