CUDA sha256 produce difference hash compared to OpenSSL - hash

iam trying to port my sha256 hash function from CPU code to CUDA. after googling, i found few working example for cuda sha256. However when tested, the hash result of cuda sha256 is difference from OpenSSL.
My input is "hello world" which is declared as const char*. result are as below;
Constant Char* Input : hello world
Hash on CPU : b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9
Hash on GPU : c1114db6b517b4db8d360a9e14f5c2a57de95d955ec20cbd4cb73facb2b13e5f
I need help to fix my GPU code for sha256 so that it will produce same hash as given by CPU (OpenSSL).
Here my code for CPU Hash
#pragma warning(disable : 4996) //disable compiler error
#include <iostream>
#include <openssl/sha.h>
unsigned char hash[SHA256_DIGEST_LENGTH];
void SHA256(const char* input, size_t input_size){
SHA256_CTX sha256;
SHA256_Init(&sha256);
SHA256_Update(&sha256, input, input_size);
SHA256_Final(hash, &sha256);
}
void CPU() {
const char* input = "hello world";
size_t input_size = strlen(input);
SHA256(input, input_size);
for (size_t i = 0; i < 32; i++) {
printf("%02x", hash[i]);
}
printf("\n");
}
and Here my code for GPU hash
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define BLOCK_SIZE 256
__constant__ unsigned int k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
__device__ unsigned int Ch(unsigned int x, unsigned int y, unsigned int z) {
return (x & y) ^ (~x & z);
}
__device__ unsigned int Maj(unsigned int x, unsigned int y, unsigned int z) {
return (x & y) ^ (x & z) ^ (y & z);
}
__device__ unsigned int Sigma0(unsigned int x) {
return (x >> 2u) | (x << 30u);
}
__device__ unsigned int Sigma1(unsigned int x) {
return (x >> 6u) | (x << 26u);
}
__device__ unsigned int sigma0(unsigned int x) {
return (x >> 7u) | (x << 25u);
}
__device__ unsigned int sigma1(unsigned int x) {
return (x >> 17u) | (x << 15u);
}
//solve using 256 thread in 1 block
__global__ void sha256_kernel(const char* input, size_t input_size, unsigned char* output) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
size_t grid_size = blockDim.x * gridDim.x;
for (; i < input_size; i += grid_size) {
unsigned int h[8] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
unsigned int w[64];
for (size_t j = 0; j < input_size; j += 64) {
for (size_t t = 0; t < 16; t++) {
w[t] = ((unsigned int)input[j + t * 4 + 0] << 24u) | ((unsigned int)input[j + t * 4 + 1] << 16u) |
((unsigned int)input[j + t * 4 + 2] << 8u) | ((unsigned int)input[j + t * 4 + 3] << 0u);
}
for (size_t t = 16; t < 64; t++) {
w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
}
unsigned int a = h[0];
unsigned int b = h[1];
unsigned int c = h[2];
unsigned int d = h[3];
unsigned int e = h[4];
unsigned int f = h[5];
unsigned int g = h[6];
unsigned int hh = h[7];
for (size_t t = 0; t < 64; t++) {
unsigned int t1 = hh + Sigma1(e) + Ch(e, f, g) + k[t] + w[t];
unsigned int t2 = Sigma0(a) + Maj(a, b, c);
hh = g;
g = f;
f = e;
e = d + t1;
d = c;
c = b;
b = a;
a = t1 + t2;
}
h[0] += a;
h[1] += b;
h[2] += c;
h[3] += d;
h[4] += e;
h[5] += f;
h[6] += g;
h[7] += hh;
}
for (size_t t = 0; t < 8; t++) {
output[i + t * 4 + 0] = (unsigned char)(h[t] >> 24u);
output[i + t * 4 + 1] = (unsigned char)(h[t] >> 16u);
output[i + t * 4 + 2] = (unsigned char)(h[t] >> 8u);
output[i + t * 4 + 3] = (unsigned char)(h[t] >> 0u);
}
}
}
void GPU() {
const char* input = "hello world";
size_t input_size = strlen(input);
size_t output_size = 32;
unsigned char* output;
char* input_device;
cudaMalloc((void**)&output, output_size);
cudaMalloc((void**)&input_device, input_size);
cudaMemcpy(input_device, input, input_size, cudaMemcpyHostToDevice);
//solve using 256 thread and 1 block
sha256_kernel << < ((input_size + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE >> > (input_device, input_size, output);
unsigned char* output_host = (unsigned char*)malloc(output_size);
cudaMemcpy(output_host, output, output_size, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < output_size; i++) {
printf("%02x", output_host[i]);
}
printf("\n");
free(output_host);
cudaFree(output);
cudaFree(input_device);}
Thanks in advance.

Related

When approaching the gpio register address of RaspberryPi, why is the result different between unsigned int* and char*?

Using mmap(), I am going to write a value to the GPIO register address of the Raspberry Pi.
I thought the register value would have the same when reading mapped GPIO address in unsigned int * or char *, but it was not. I compared the results for both cases.
This is my code.
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#define GPIO_BASE 0x3F200000
#define GPFSEL1 0x04
#define GPSET0 0x1C
#define GPCLR0 0x28
int main()
{
int fd = open("/dev/mem", O_RDWR|O_SYNC);
// Error Handling
if (fd < 0) {
printf("Can't open /dev/mem \n");
exit(1);
}
// Map pages of memory
char *gpio_memory_map = (char*)mmap(0, 4096, PROT_READ|PROT_WRITE,
MAP_SHARED, fd, GPIO_BASE);
// Error Handling
if (gpio_memory_map == MAP_FAILED) {
printf("Error : mmap \n");
exit(-1);
}
// GPIO18
//volatile unsigned int *gpio = (volatile unsigned int*)gpio_memory_map;
//gpio[GPFSEL1/4] = (1<<24);
volatile char *gpio = (volatile char *)gpio_memory_map;
int i;
for (i = 0; i < 16; i++)
printf("gpio[%d](%#x) = %#0x\n", i, &gpio[i], gpio[i]);
/*
for (i = 0; i < 5; i++) {
gpio[GPCLR0 / 4] = (1 << 18);
sleep(1);
gpio[GPSET0 / 4] = (1 << 18);
sleep(1);
}
*/
// Unmap pages of memory
munmap(gpio_memory_map, 4096);
return 0;
}
And those below are the results.
volatile unsigned int *gpio = (volatile unsigned int *)gpio_memory_map;
gpio[0](0x76f12000) = 0x1
gpio[1](0x76f12004) = 0x1000000
gpio[2](0x76f12008) = 0
gpio[3](0x76f1200c) = 0x3fffffc0
gpio[4](0x76f12010) = 0x24000924
gpio[5](0x76f12014) = 0x924
gpio[6](0x76f12018) = 0
gpio[7](0x76f1201c) = 0x6770696f
gpio[8](0x76f12020) = 0x6770696f
gpio[9](0x76f12024) = 0x6770696f
gpio[10](0x76f12028) = 0x6770696f
gpio[11](0x76f1202c) = 0x6770696f
gpio[12](0x76f12030) = 0x6770696f
gpio[13](0x76f12034) = 0x2ffbbfff
gpio[14](0x76f12038) = 0x3ef4ff
gpio[15](0x76f1203c) = 0
volatile char *gpio = (volatile char *)gpio_memory_map;
As the result #1 above, I thought gpio[1], gpio[2], gpio[3] should be 0. But it was different. And even if I try to write a new value on gpio[1] or gpio[2] or gpio[3], it stays the same. Why are the results different when approaching char * and unsigned char *?
gpio[0](0x76f47000) = 0x1
gpio[1](0x76f47001) = 0x69
gpio[2](0x76f47002) = 0x70
gpio[3](0x76f47003) = 0x67
gpio[4](0x76f47004) = 0
gpio[5](0x76f47005) = 0x69
gpio[6](0x76f47006) = 0x70
gpio[7](0x76f47007) = 0x67
gpio[8](0x76f47008) = 0
gpio[9](0x76f47009) = 0x69
gpio[10](0x76f4700a) = 0x70
gpio[11](0x76f4700b) = 0x67
gpio[12](0x76f4700c) = 0xc0
gpio[13](0x76f4700d) = 0x69
gpio[14](0x76f4700e) = 0x70
gpio[15](0x76f4700f) = 0x67

Saving Nef polyhedron as Polyhedron_3 or Surface_mesh gives different results

I wanted to save a Nef polyhedron into an OFF file for visualizing it. As written in the CGAL Nef polyhedra user manual (see paragraphs 5.4 and 5.5), a Nef polyhedron can be converted both to a Polyhedron_3 or a Surface_mesh.
However, I noticed that when converting to those structures and then saving it into an OFF file, the results are different.
Here I report the code for a minimal example:
#include <list>
#include <iostream>
#include <fstream>
#include <CGAL/Exact_predicates_exact_constructions_kernel.h>
#include <CGAL/Polyhedron_3.h>
#include <CGAL/Nef_polyhedron_3.h>
#include <CGAL/IO/Nef_polyhedron_iostream_3.h>
#include <CGAL/Surface_mesh.h>
#include <CGAL/boost/graph/convert_nef_polyhedron_to_polygon_mesh.h>
typedef CGAL::Exact_predicates_exact_constructions_kernel Kernel;
typedef Kernel::Point_3 Point_3;
typedef CGAL::Surface_mesh<Point_3> Mesh;
typedef CGAL::Polyhedron_3<Kernel> Polyhedron_3;
typedef CGAL::Nef_polyhedron_3<Kernel> Nef_polyhedron;
typedef Kernel::Vector_3 Vector_3;
typedef Kernel::Aff_transformation_3 Aff_transformation_3;
int convertStlToOff(const char* inputFilename, const char* outputFilename)
{
//read 80 bytes and put in std::cerr
std::ifstream obj(inputFilename, std::ios::in | std::ios::binary);
for (int i = 0; i < 80; i++) {
boost::uint8_t c;
obj.read(reinterpret_cast<char*>(&c), sizeof(c));
std::cerr << c;
}
std::cerr << std::endl;
//read 4 bytes and initialize number of triangles
boost::uint32_t N32;
obj.read(reinterpret_cast<char*>(&N32), sizeof(N32));
unsigned int N = N32;
std::cerr << N << " triangles" << std::endl;
//reserve space for N faces
std::vector<Point_3> points;
std::map<Point_3, int> pmap;
typedef boost::tuple<int, int, int> Face;
std::vector<Face> faces;
faces.reserve(N);
//read all faces
int number_of_points = 0;
int number_of_snapped_points = 0;
for (int i = 0; i < N; i++)
{
//read face normal (it is ignored)
float normal[3];
obj.read(reinterpret_cast<char*>(&normal[0]), sizeof(normal[0]));
obj.read(reinterpret_cast<char*>(&normal[1]), sizeof(normal[1]));
obj.read(reinterpret_cast<char*>(&normal[2]), sizeof(normal[2]));
//read coordinates of all 3 points
int index[3];
for (int j = 0; j < 3; j++)
{
float x, y, z;
obj.read(reinterpret_cast<char*>(&x), sizeof(x));
obj.read(reinterpret_cast<char*>(&y), sizeof(y));
obj.read(reinterpret_cast<char*>(&z), sizeof(z));
Point_3 p(x, y, z);
if (pmap.find(p) == pmap.end())
{
// check brute force if there is a close point
bool found_close_point = false;
/*for (int k = 0; k < points.size(); k++)
{
if (sqrt(CGAL::squared_distance(p, points[k])) < 0.00001)
{
index[j] = k;
found_close_point = true;
number_of_snapped_points++;
}
}*/
if (!found_close_point)
{
points.push_back(p);
index[j] = number_of_points;
pmap[p] = number_of_points++;
}
}
else {
index[j] = pmap[p];
}
}
faces.push_back(boost::make_tuple(index[0], index[1], index[2]));
//read two additional bytes, and ignore them
char c;
obj.read(reinterpret_cast<char*>(&c), sizeof(c));
obj.read(reinterpret_cast<char*>(&c), sizeof(c));
}
std::cerr << number_of_snapped_points << " snapped points" << std::endl;
std::ofstream outputFile(outputFilename);
outputFile.precision(20);
outputFile << "OFF\n" << points.size() << " " << faces.size() << " 0" << std::endl;
for (int i = 0; i < points.size(); i++)
{
outputFile << points[i] << std::endl;
}
for (int i = 0; i < faces.size(); i++)
{
outputFile << "3 " << boost::get<0>(faces[i]) << " " << boost::get<1>(faces[i]) << " " << boost::get<2>(faces[i]) << std::endl;
}
return 0;
}
void fill_cube_1(Polyhedron_3 & poly)
{
std::string input =
"OFF\n\
8 12 0\n\
-1 -1 -1\n\
-1 1 -1\n\
1 1 -1\n\
1 -1 -1\n\
-1 -1 1\n\
-1 1 1\n\
1 1 1\n\
1 -1 1\n\
3 0 1 3\n\
3 3 1 2\n\
3 0 4 1\n\
3 1 4 5\n\
3 3 2 7\n\
3 7 2 6\n\
3 4 0 3\n\
3 7 4 3\n\
3 6 4 7\n\
3 6 5 4\n\
3 1 5 6\n\
3 2 1 6";
std::stringstream ss;
ss << input;
ss >> poly;
}
enum savingModality
{
SAVE_AS_POLYHEDRON_3 = 0,
SAVE_AS_SURFACE_MESH = 1,
};
int saveNefObjectInOffFile(Nef_polyhedron offObject, const char* filename, savingModality modality)
{
if (!offObject.is_simple())
{
printf("Object is not simple. Cannot convert to mesh or polyhedron\n");
return 1;
}
std::ofstream outStream;
outStream.open(filename);
if (modality == SAVE_AS_POLYHEDRON_3)
{
Polyhedron_3 outputPolyhedron;
offObject.convert_to_Polyhedron(outputPolyhedron);
outStream << outputPolyhedron;
}
else if (modality == SAVE_AS_SURFACE_MESH)
{
Mesh outputMesh;
CGAL::convert_nef_polyhedron_to_polygon_mesh(offObject, outputMesh);
outStream << outputMesh;
}
outStream.close();
return 0;
}
int main()
{
int ret;
//construct nef object #1
Polyhedron_3 cube1;
fill_cube_1(cube1);
Nef_polyhedron nefObject1(cube1);
//construct nef object #2
Nef_polyhedron nefObject2(cube1);
Aff_transformation_3 scale2(1, 0, 0,
0, 1, 0,
0, 0, 1,
2);
nefObject2.transform(scale2);
Aff_transformation_3 translation2(CGAL::TRANSLATION, Vector_3(-0.5, -0.5, -0.5));
nefObject2.transform(translation2);
//construct nef object #3
Nef_polyhedron nefObject3;
nefObject3 = nefObject1 - nefObject2;
//save results into .off file
ret = saveNefObjectInOffFile(nefObject3, "out1.off", SAVE_AS_POLYHEDRON_3);
ret = saveNefObjectInOffFile(nefObject3, "out2.off", SAVE_AS_SURFACE_MESH);
return 0;
}
and the screenshots of the visualization of the two files: saving as Polyhedron_3 and saving as Surface_mesh. As you can see, it seems like if some faces were missing.
My question is: "Why the results are visualized different?"
The output to Polyhedron_3 is triangulated while the output to Surface_mesh is not. There is a bug in meshlab to display non convex faces I guess.
Look at the doc you'll see that there is a Boolean parameter to trigger or not the triangulation.

uppercase and lowercase in c

I have tried to figure out why the program doesn't work. It turns lowercase to uppercase, let's say I type "k", it returns K. Then I continue typint "A", it doesn't return "a", but exits. But why? Here's the code:
#include <stdio.h>
#include <stdlib.h>
int main(){
char UPPER,LOWER;
printf("Enter UPPERCASE\n");
UPPER = getchar();
if (UPPER >= 65 && UPPER <= 90)
{
UPPER = UPPER + 32;
printf("The UPPERCASE now is %c\n", UPPER);
}
printf("Enter lowercase\n");
LOWER = getchar();
if (LOWER >= 97 && LOWER <= 122)
{
LOWER = LOWER - 32;
printf("The lowercase now is %c\n", LOWER);
}
getchar();
getchar();
}
If you compile and run this code:
void main(void)
{
char c = getchar();
printf("c = %d %c\n", c, c);
c = getchar();
printf("c = %d %c\n", c, c);
}
You will see this output:
user#host ~/work/tmp $ ./test
a
c = 97 a
c = 10
/* new line there*/
This code is not the same, but works:
#include <stdlib.h>
#include <stdio.h>
#define BUFSIZE 4
int main(void)
{
char UPPER[BUFSIZE] = {0}, LOWER[BUFSIZE] = {0};
int i;
printf("Enter UPPERCASE\n");
fgets(UPPER, BUFSIZE, stdin);
for(i = 0; i < BUFSIZE; i++)
{
if (UPPER[i] >= 65 && UPPER[i] <= 90)
{
UPPER[i] = UPPER[i] + 32;
}
}
printf("The UPPERCASE now is %s", UPPER);
printf("Enter LOWERCASE\n");
fgets(LOWER, BUFSIZE, stdin);
for(i = 0; i < BUFSIZE; i++)
{
if (LOWER[i] >= 97 && LOWER[i] <= 122)
{
LOWER[i] = LOWER[i] - 32;
}
}
printf("The LOWERCASE now is %s", LOWER);
return 0;
}
You should separately add getchar();, after printf("The UPPERCASE now is %c\n", UPPER);
and again after printf("The lowercase now is %c\n", LOWER);.
Most of the program is ending with getch(),and so we think that getch() is used to display the output...but it is wrong.It is used to get a single character from the console.
your correct code should be like this:
#include <stdio.h>
#include <stdlib.h>
int main()
{
char UPPER, LOWER;
printf("Enter UPPERCASE\n");
UPPER = getchar();
if (UPPER >= 65 && UPPER <= 90)
{
UPPER = UPPER + 32;
printf("The UPPERCASE now is %c\n", UPPER);
}
getchar();
printf("Enter lowercase\n");
LOWER = getchar();
if (LOWER >= 97 && LOWER <= 122)
{
LOWER = LOWER - 32;
printf("The lowercase now is %c\n", LOWER);
}
getchar();
}

CUFFT: trying to implement row by row fft of a matrix

I am trying to replicate matlab fft functionality, where it does a row by row (or column by column) fft of a matrix. Each row would be one of the batches in the cufft plan.
I can get it working using cufftExecC2C (the commented out part in the code below works), but not cufftExecR2C. My code is using cufftPlan1d, but ideally I want to implement it using cufftPlanMany.
I am wondering what I'm doing wrong, and if there is a better way of doing this. Thank you.
// linker -> input -> additional dependencies -> add 'cufft.lib'
// VC++ Directories -> include directories - > add 'C:\ProgramData\NVIDIA Corporation\CUDA Samples\v6.0\common\inc'
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>
#include <iostream>
#define NX 6
#define NY 5
void printArray(float *my_array);
void printComplexArray(float2 *my_array);
int main(){
/************************************************************ C2C ************************************************************/
/*
float2 *initial_array = (float2 *)malloc(sizeof(float2) * NX * NY);
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY; w++){
initial_array[NY * h + w].x = 0;
initial_array[NY * h + w].y = 0;
}
}
initial_array[NY*3 + 0].x = 1;
initial_array[NY*5 + 0].x = 1;
printComplexArray(initial_array);
float2 *transformed_array= (float2 *)malloc(sizeof(float2) * NX * NY);
cufftComplex *gpu_initial_array;
cufftComplex *gpu_transformed_array;
cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftComplex));
cudaMalloc((void **)&gpu_transformed_array, NX*NY*sizeof(cufftComplex));
cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float2), cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan1d(&plan, NY, CUFFT_C2C, NX);
cufftExecC2C(plan, gpu_initial_array, gpu_transformed_array, CUFFT_FORWARD);
cudaMemcpy(transformed_array, gpu_transformed_array, NX*NY*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
printComplexArray(transformed_array);
*/
/************************************************************ C2C ************************************************************/
/************************************************************ R2C ************************************************************/
float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY; w++)
initial_array[NY * h + w] = 0;
}
initial_array[NY*3 + 0] = 1;
printArray(initial_array);
float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);
cufftReal *gpu_initial_array;
cufftComplex *gpu_transformed_array;
cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal));
cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex));
cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan1d(&plan, NY, CUFFT_R2C, NX);
// ***** cufftPlanMany *****
//int n[2] = {NX, NY};
//cufftPlanMany(&plan,1,n,NULL,1,0,NULL,1,0,CUFFT_R2C,NX);
cufftExecR2C(plan, gpu_initial_array, gpu_transformed_array);
cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
printComplexArray(transformed_array);
/************************************************************ R2C ************************************************************/
cufftDestroy(plan);
free(initial_array);
free(transformed_array);
cudaFree(gpu_initial_array);
cudaFree(gpu_transformed_array);
std::system("pause");
return 0;
}
void printArray(float *my_array){
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY; w++)
std::cout << my_array[NY * h + w] << " | ";
std::cout << std::endl;
}
std::cout << std::endl;
}
void printComplexArray(float2 *my_array){
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY; w++)
std::cout << my_array[NY * h + w].x << " + " << my_array[NY * h + w].y << " | ";
std::cout << std::endl;
}
std::cout << std::endl;
}
It seems that your issue resides in the way you print out the result. You cannot use the same routine to print for the two cases of CUFFT_R2C and CUFFT_C2C. In the former case, you have a (NY/2+1)*NX sized output, while the the latter case you have a NY*NX sized output. The fixed code below should work.
Also, it would be also good to add proper CUDA error check and CUFFT error check, which I have also added to the code below.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>
#include <assert.h>
#include <iostream>
#define NX 6
#define NY 5
void printArray(float *my_array);
void printComplexSymmetricArray(float2 *my_array);
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/********/
/* MAIN */
/********/
int main(){
float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY; w++)
initial_array[NY * h + w] = 0;
}
initial_array[NY*3 + 0] = 1;
printArray(initial_array);
float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);
cufftReal *gpu_initial_array;
cufftComplex *gpu_transformed_array;
gpuErrchk(cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal)));
gpuErrchk(cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex)));
gpuErrchk(cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice));
cufftHandle plan;
cufftSafeCall(cufftPlan1d(&plan, NY, CUFFT_R2C, NX));
cufftSafeCall(cufftExecR2C(plan, (cufftReal*)gpu_initial_array, (cufftComplex*)gpu_transformed_array));
gpuErrchk(cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
printComplexSymmetricArray(transformed_array);
cufftSafeCall(cufftDestroy(plan));
free(initial_array);
free(transformed_array);
gpuErrchk(cudaFree(gpu_initial_array));
gpuErrchk(cudaFree(gpu_transformed_array));
std::system("pause");
return 0;
}
/***********************/
/* PRINTOUT REAL ARRAY */
/***********************/
void printArray(float *my_array){
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY; w++)
std::cout << my_array[NY * h + w] << " | ";
std::cout << std::endl;
}
std::cout << std::endl;
}
/************************************/
/* PRINTOUT COMPLEX SYMMETRIC ARRAY */
/************************************/
void printComplexSymmetricArray(float2 *my_array){
for (int h = 0; h < NX; h++){
for (int w = 0; w < NY/2+1; w++)
std::cout << my_array[(NY/2+1) * h + w].x << " + " << my_array[(NY/2+1) * h + w].y << " | ";
std::cout << std::endl;
}
std::cout << std::endl;
}

Windows C API for UTF8 to 1252

I'm familiar with WideCharToMultiByte and MultiByteToWideChar conversions and could use these to do something like:
UTF8 -> UTF16 -> 1252
I know that iconv will do what I need, but does anybody know of any MS libs that will allow this in a single call?
I should probably just pull in the iconv library, but am feeling lazy.
Thanks
Windows 1252 is mostly equivalent to latin-1, aka ISO-8859-1: Windows-1252 just has some additional characters allocated in the latin-1 reserved range 128-159. If you are ready to ignore those extra characters, and stick to latin-1, then conversion is rather easy. Try this:
#include <stddef.h>
/*
* Convert from UTF-8 to latin-1. Invalid encodings, and encodings of
* code points beyond 255, are replaced by question marks. No more than
* dst_max_len bytes are stored in the destination array. Returned value
* is the length that the latin-1 string would have had, assuming a big
* enough destination buffer.
*/
size_t
utf8_to_latin1(char *src, size_t src_len,
char *dst, size_t dst_max_len)
{
unsigned char *sb;
size_t u, v;
u = v = 0;
sb = (unsigned char *)src;
while (u < src_len) {
int c = sb[u ++];
if (c >= 0x80) {
if (c >= 0xC0 && c < 0xE0) {
if (u == src_len) {
c = '?';
} else {
int w = sb[u];
if (w >= 0x80 && w < 0xC0) {
u ++;
c = ((c & 0x1F) << 6)
+ (w & 0x3F);
} else {
c = '?';
}
}
} else {
int i;
for (i = 6; i >= 0; i --)
if (!(c & (1 << i)))
break;
c = '?';
u += i;
}
}
if (v < dst_max_len)
dst[v] = (char)c;
v ++;
}
return v;
}
/*
* Convert from latin-1 to UTF-8. No more than dst_max_len bytes are
* stored in the destination array. Returned value is the length that
* the UTF-8 string would have had, assuming a big enough destination
* buffer.
*/
size_t
latin1_to_utf8(char *src, size_t src_len,
char *dst, size_t dst_max_len)
{
unsigned char *sb;
size_t u, v;
u = v = 0;
sb = (unsigned char *)src;
while (u < src_len) {
int c = sb[u ++];
if (c < 0x80) {
if (v < dst_max_len)
dst[v] = (char)c;
v ++;
} else {
int h = 0xC0 + (c >> 6);
int l = 0x80 + (c & 0x3F);
if (v < dst_max_len) {
dst[v] = (char)h;
if ((v + 1) < dst_max_len)
dst[v + 1] = (char)l;
}
v += 2;
}
}
return v;
}
Note that I make no guarantee about this code. This is completely untested.