is required to read data from a binary file without loading them in Bitmap because it is too much, more than 20000x20000 pixels, I need to open a file, one line at a time to read a file for processing. found an example for reading BMP, can not understand how in the same way to get data from PNG.
byte[] B = File.ReadAllBytes(filename);
GCHandle GCH = GCHandle.Alloc(B, GCHandleType.Pinned);
IntPtr Scan0 = (IntPtr)((int)(GCH.AddrOfPinnedObject()) + 54);
int W = Marshal.ReadInt32(Scan0, -36);
int H = Marshal.ReadInt32(Scan0, -32);
Bitmap Bmp = new Bitmap(W, H, 4 * W, PixelFormat.Format32bppArgb, Scan0);
GCH.Free();
return Bmp;
Language C#
I found a library (PNGChunkParser) with parsing blocks APG has received all the blocks that are in the file, I found a 21 unit Idate, but trying to paint them 50 lines, to be not the right picture:
byte[] B = File.ReadAllBytes(filename);
List<byte> tmpb = new List<byte>();
using (MemoryStream stream = new MemoryStream(B))
{
PNGChunk[] chunks = PNGChunkParser.ChunksFromStream(stream).ToArray();
foreach (PNGChunk item in chunks)
{
if (item.TypeString == "IDAT")
{
PNGChunk idatChunk = item;
foreach (byte s in idatChunk.Data)
{
tmpb.Add(s);
}
}
}
}
var size_png = ImageHelper.GetDimensions(filename);
GetConturPic(tmpb.ToArray(), size_png.Width, 50,PixelFormat.Format32bppArgb);
private void GetConturPic(byte[] data, int w, int h, PixelFormat pixel_format)
{
int index;
int stride = GetStride(w, pixel_format);
Bitmap bm = new Bitmap(w, h);
Color fm = new Color();
try
{
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
index = y * stride + 4 * x;
fm = GetPixelColor(data, w, h, x, y,
System.Drawing.Bitmap.GetPixelFormatSize(pixel_format));
bm.SetPixel(x, y, fm);
}
pictureBox1.Image = bm;
pictureBox1.Refresh();
}
}
catch (Exception ex)
{
listBox1.Items.Add(ex.Message);
}
}
public Color GetPixelColor(byte[] Pixels, int w, int h, int x, int y, int depth)
{
Color clr = Color.Empty;
// Get color components count
int cCount = depth / 8;
// Get start index of the specified pixel
int i = ((y * w) + x) * cCount;
if (i > Pixels.Length - cCount)
throw new IndexOutOfRangeException();
byte b, g, a, r, c;
switch (depth)
{
case 32:
b = Pixels[i];
g = Pixels[i + 1];
r = Pixels[i + 2];
a = Pixels[i + 3]; // a
clr = Color.FromArgb(a, b, g, r);
break;
case 24:
b = Pixels[i];
g = Pixels[i + 1];
r = Pixels[i + 2];
clr = Color.FromArgb(b, g, r);
break;
case 8:
c = Pixels[i];
clr = Color.FromArgb(c, c, c);
break;
}
return clr;
}
help what need to do to get 50 lines of image ?
Related
I'm using this code to convert RGB to HSV.
now i have to revert back to RGB.
is that any way to convert this in flutter??
I just recently started working in flutter, please help me out, I'm stuck here.
Thank you so much in advance.
rgbToHsv(int r1, int g1, int b1) {
// R, G, B values are divided by 255
// to change the range from 0..255 to 0..1:
double r = r1 / 255.0;
double g = g1 / 255.0;
double b = b1 / 255.0;
// h, s, v = hue, saturation, value
var cmax = [r, g, b].reduce(max); // maximum of r, g, b
var cmin = [r, g, b].reduce(min); // minimum of r, g, b
var diff = cmax - cmin; // diff of cmax and cmin.
var h;
var s;
var v;
//Get value of h
if (cmax == cmin) {
h = 0;
} else if (cmax == r) {
h = (60 * ((g - b) / diff) + 360) % 360;
} else if (cmax == g) {
h = (60 * ((b - r) / diff) + 120) % 360;
} else if (cmax == b) {
h = (60 * ((r - g) / diff) + 240) % 360;
}
//Get value of s
if (cmax == 0) {
s = 0;
} else {
s = (diff / cmax) * 100;
}
//Get value of v
v = cmax * 100;
//Convert HSV [360, 100, 100] to HSV [256, 256, 256]
double h_256 = (h / 360) * 255;
double s_256 = (s / 100) * 255;
double v_256 = (v / 100) * 255;
int h_256_int = h_256.toInt();
int s_256_int = s_256.toInt();
int v_256_int = v_256.toInt();
//Convert to HSV HEX
var hex_h = h_256_int.toRadixString(16);
var hex_s = s_256_int.toRadixString(16);
var hex_v = v_256_int.toRadixString(16);
//MERGE HSV HEX
var finalColor = hex_h + hex_s + hex_v;
print("HSV HEX:" + finalColor.toUpperCase());
/////RGB TRANS>>>>>>>>>>>>>>>>>
////////RGB TRANS>>>>>>>>>>>>>>>>>
////////RGB TRANS>>>>>>>>>>>>>>>>>
////////RGB TRANS>>>>>>>>>>>>>>>>>
var _h = hextToint(hex_h);
var _s = hextToint(hex_s);
var _v = hextToint(hex_v);
print(_h);
print(_s);
print(_v);
print(hsvToRgb(_h.toDouble(), _s.toDouble(), _v.toDouble()));
//return rgb;
}
this might help. I have used this in one of my projects.
String hsvToRgb(double H, double S, double V) {
int R, G, B;
H /= 360;
S /= 100;
V /= 100;
if (S == 0) {
R = (V * 255).toInt();
G = (V * 255).toInt();
B = (V * 255).toInt();
} else {
double var_h = H * 6;
if (var_h == 6) var_h = 0; // H must be < 1
int var_i = var_h.floor(); // Or ... var_i =
// floor( var_h )
double var_1 = V * (1 - S);
double var_2 = V (1 - S (var_h - var_i));
double var_3 = V (1 - S (1 - (var_h - var_i)));
double var_r;
double var_g;
double var_b;
if (var_i == 0) {
var_r = V;
var_g = var_3;
var_b = var_1;
} else if (var_i == 1) {
var_r = var_2;
var_g = V;
var_b = var_1;
} else if (var_i == 2) {
var_r = var_1;
var_g = V;
var_b = var_3;
} else if (var_i == 3) {
var_r = var_1;
var_g = var_2;
var_b = V;
} else if (var_i == 4) {
var_r = var_3;
var_g = var_1;
var_b = V;
} else {
var_r = V;
var_g = var_1;
var_b = var_2;
}
R = (var_r * 255).toInt(); // RGB results from 0 to 255
G = (var_g * 255).toInt();
B = (var_b * 255).toInt();
}
String rs = R.toRadixString(16);
String gs = G.toRadixString(16);
String bs = B.toRadixString(16);
if (rs.length == 1) rs = "0" + rs;
if (gs.length == 1) gs = "0" + gs;
if (bs.length == 1) bs = "0" + bs;
return "#" + rs + gs + bs;
}
RGB to HSV
HSVColor rgbToHSV(int r, int g, int b, {double opacity = 1}) {
return HSVColor.fromColor(Color.fromRGBO(r, g, b, opacity));
}
HSV to RGB
List<int> hsvToRGB(HSVColor color) {
//convert to color
final c = color.toColor();
return [c.red, c.blue, c.green];
}
To use HSVColor use myHSVcolor.toColor().
More about HSV Color.
Expanding on Yeasin Sheikh answer
Flutter
RGB to HSV
HSVColor rgbToHSV(int r, int g, int b, {double opacity = 1}) {
return HSVColor.fromColor(Color.fromRGBO(r, g, b, opacity));
}
HSV to RGB
List<int> hsvToRGB(HSVColor color) {
//convert to color
final c = color.toColor();
return [c.red, c.blue, c.green];
}
To use HSVColor use myHSVcolor.toColor().
More about HSV Color.
Native Dart
Import color package
RGB to HSV
final RgbColor rgbColor = RgbColor(red, green, blue);
final HsvColor hsvColor = rgbColor.toHsvColor();
HSV to RGB
final HsvColor hsvColor = HsvColor(hueValue, saturationValue, valueValue)
final RgbColor rgbColor = hsvColor.toRgbColor();
HSVColor class is not supported in native dart so we are using a package for that with similar classes.
I’m new to processing. I am looking to write a program in which I select an area of an image ( like “rect selection” in Photoshop for example…), this area has a red stroke and is slightly opaque, once the area is selected, the rectangle fills with the average color of the pixels of the same area and the red stroke turns off. The idea is to be able to repeat this action several times on the same image. When I released the mouse, the fill is erased because of background(buff); in void draw();. I would like the rectangle filled with the new color to be saved. I think I need to use an array, class, but I don’t understand how these work. If someone is able to help me, it would be a great help. Thank you.
PImage buff1;
int x1,y1,x2,y2,h1,h2;
void setup()
{
size(1000, 721);
buff1 = loadImage("buff1.jpg2);
background(buff1);
}
color extractColorFromImage(final PImage buff1) {
buff1.loadPixels();
color r = 1, g = 1, b = 1;
for (final color c : buff1.pixels) {
r += c >> 020 & 255;
g += c >> 010 & 255;
b += c & 255;
}
r /= buff1.pixels.length;
g /= buff1.pixels.length;
b /= buff1.pixels.length;
return color(r, g, b);
}
void draw()
{
background(buff1);
rectMode(CORNERS);
stroke(255,0,0);
strokeWeight(2);
strokeJoin(ROUND);
rect(x1,y1,x2,y2,2);
fill(255,0,0,50);
noStroke();
cursor(ARROW);
}
void mousePressed()
{
x1 = mouseX;
y1 = mouseY;
}
void mouseDragged()
{
x2 = mouseX;
y2 = mouseY;
}
void mouseReleased()
{
int H1 = abs(1+x2-x1);
int H2 = abs(1+y2-y1);
for (int i=0; i<width; i+=H1)
{
for (int j=0; j<height; j+=H2)
{
PImage newImg = buff1.get(x1, y1, H1, H2);
fill(extractColorFromImage(newImg), 40);
noStroke();
cursor(ARROW);
}
}
}
Once the pixel data for an image have been loaded by loadPixels(), the loaded pixels can be accessed and changed by pixels[].
updatePixels() updates the image with the data in its pixels[] array:
void mouseReleased()
{
int x_1 = min(x1,x2);
int y_1 = min(y1,y2);
int x_2 = max(x1,x2);
int y_2 = max(y1,y2);
PImage newImg = buff1.get(x_1, y_1, x_2-x1+1, y_2-y1+1);
color new_color = extractColorFromImage(newImg);
buff1.loadPixels();
for (int i = x_1; i <= x_2; i++)
{
for (int j = y_1; j <= y_2; j++)
{
buff1.pixels[j*buff1.width+i] = new_color;
}
}
buff1.updatePixels();
}
When the mouse button is pressed, then I recommend to set (x2, y2), too:
void mousePressed()
{
x1 = mouseX;
y1 = mouseY;
x2 = x1;
y2 = y1;
}
Optionally the original color can be mixed with the new color by lerpColor():
PImage newImg = buff1.get(x_1, y_1, x_2-x1+1, y_2-y1+1);
color new_color = extractColorFromImage(newImg);
buff1.loadPixels();
for (int i = x_1; i <= x_2; i++)
{
for (int j = y_1; j <= y_2; j++)
{
color orig_color = buff1.pixels[j*buff1.width+i];
buff1.pixels[j*buff1.width+i] = lerpColor(orig_color, new_color, 0.5);
}
}
buff1.updatePixels();
I want to convert camera image from function startImageStream of camera plugin in Flutter to Image to crop that image but I only find the way to convert to FirebaseVisionImage.
Edit For Color Image
if I understand you clear. You are trying to covnert YUV420 format. The following is code snippet from: https://github.com/flutter/flutter/issues/26348
const shift = (0xFF << 24);
Future<Image> convertYUV420toImageColor(CameraImage image) async {
try {
final int width = image.width;
final int height = image.height;
final int uvRowStride = image.planes[1].bytesPerRow;
final int uvPixelStride = image.planes[1].bytesPerPixel;
print("uvRowStride: " + uvRowStride.toString());
print("uvPixelStride: " + uvPixelStride.toString());
// imgLib -> Image package from https://pub.dartlang.org/packages/image
var img = imglib.Image(width, height); // Create Image buffer
// Fill image buffer with plane[0] from YUV420_888
for (int x = 0; x < width; x++) {
for (int y = 0; y < height; y++) {
final int uvIndex = uvPixelStride * (x / 2).floor() + uvRowStride * (y / 2).floor();
final int index = y * width + x;
final yp = image.planes[0].bytes[index];
final up = image.planes[1].bytes[uvIndex];
final vp = image.planes[2].bytes[uvIndex];
// Calculate pixel color
int r = (yp + vp * 1436 / 1024 - 179).round().clamp(0, 255);
int g = (yp - up * 46549 / 131072 + 44 - vp * 93604 / 131072 + 91).round().clamp(0, 255);
int b = (yp + up * 1814 / 1024 - 227).round().clamp(0, 255);
// color: 0x FF FF FF FF
// A B G R
img.data[index] = shift | (b << 16) | (g << 8) | r;
}
}
imglib.PngEncoder pngEncoder = new imglib.PngEncoder(level: 0, filter: 0);
List<int> png = pngEncoder.encodeImage(img);
muteYUVProcessing = false;
return Image.memory(png);
} catch (e) {
print(">>>>>>>>>>>> ERROR:" + e.toString());
}
return null;
}
I found, sometimes planes[0] bytes per row is not same as width. In that case, you should do something like the code below.
static image_lib.Image convertYUV420ToImage(CameraImage cameraImage) {
final width = cameraImage.width;
final height = cameraImage.height;
final yRowStride = cameraImage.planes[0].bytesPerRow;
final uvRowStride = cameraImage.planes[1].bytesPerRow;
final uvPixelStride = cameraImage.planes[1].bytesPerPixel!;
final image = image_lib.Image(width, height);
for (var w = 0; w < width; w++) {
for (var h = 0; h < height; h++) {
final uvIndex =
uvPixelStride * (w / 2).floor() + uvRowStride * (h / 2).floor();
final index = h * width + w;
final yIndex = h * yRowStride + w;
final y = cameraImage.planes[0].bytes[yIndex];
final u = cameraImage.planes[1].bytes[uvIndex];
final v = cameraImage.planes[2].bytes[uvIndex];
image.data[index] = yuv2rgb(y, u, v);
}
}
return image;
}
static int yuv2rgb(int y, int u, int v) {
// Convert yuv pixel to rgb
var r = (y + v * 1436 / 1024 - 179).round();
var g = (y - u * 46549 / 131072 + 44 - v * 93604 / 131072 + 91).round();
var b = (y + u * 1814 / 1024 - 227).round();
// Clipping RGB values to be inside boundaries [ 0 , 255 ]
r = r.clamp(0, 255);
g = g.clamp(0, 255);
b = b.clamp(0, 255);
return 0xff000000 |
((b << 16) & 0xff0000) |
((g << 8) & 0xff00) |
(r & 0xff);
}
In Matlab, there is both repmat and repelem. For matrices x1 and x2 of sizes [d, n1] and [d, n2], you can do this in Matlab:
n1 = size(x1, 2);
n2 = size(x2, 2);
M = repmat(x1, 1, n2) - repelem(x2, 1, n1);
What is the equivalent Eigen code? Below are four variants that I am not quite satisfied with. I wonder if it could be made into a faster one-liner?
TL;DR: Variant 2 is best, but it depends on the compiler and other things.
int d = x1.rows();
int n1 = x1.cols();
int n2 = x2.cols();
Eigen::MatrixXd M(d, n1*n2);
// Variant 1:
int idx = 0;
for (int c = 0; c != n2; c++) {
for (int r = 0; r != n1; r++) {
M.col(idx) = x1.col(r) - x2.col(c);
idx += 1;
}
}
// Variant 2:
for (int c = 0, idx = 0; c != n2; c += 1, idx += n1)
M.block(0, idx, d, n1) = x1.colwise() - x2.col(c);
// Variant 3:
M = - x2.replicate(n1, 1);
M.resize(d, n1*n2);
M += x1.replicate(1, n2);
// Variant 5:
M = x1.rowwise().replicate(n2) - x2(all,VectorXi::LinSpaced(n1*n2,0,n2-1));
Here are my current timings, see below for the complete code. Matlab timings are for multithread and single thread Matlab R2017b. C++ versions compiled with flags -O3 -DNDEBUG -march=native -mtune=native. All were run on a i5-6500, so I have AVX.
time in seconds
Code gcc-7 gcc-8 clang-6
-----------------------------------
Matlab mt 51
Matlab st 84
V. 1 38 37 57
V. 2 36 34 23
V. 3 598 599 187
V. 5 94 172 107
Matlab code:
ds = 1:10;
n1s = 5:5:500;
n2s = 5:5:500;
z1 = randn(max(ds), max(n1s));
z2 = randn(max(ds), max(n2s));
tic;
s = 0;
for idx = 1:numel(ds)
for jdx = 1:numel(n1s)
for kdx = 1:numel(n2s)
K = MFdiff(z1(1:ds(idx), 1:n1s(jdx)),...
z2(1:ds(idx), 1:n2s(kdx)));
s = s + K(1,1);
end
end
end
toc
C++ code:
#include <iostream>
#include <Eigen/Dense>
using namespace Eigen;
template <typename Derived1, typename Derived2>
MatrixXd MFdiff1(
DenseBase<Derived1> const & x1,
DenseBase<Derived2> const & x2)
{
int d = x1.rows();
int n1 = x1.cols();
int n2 = x2.cols();
MatrixXd out(d, n1*n2);
int idx = 0;
for (int c = 0; c != n2; c++) {
for (int r = 0; r != n1; r++) {
out.col(idx) = x1.col(r) - x2.col(c);
idx += 1;
}
}
return out;
}
template <typename Derived1, typename Derived2>
MatrixXd MFdiff2(
DenseBase<Derived1> const & x1,
DenseBase<Derived2> const & x2)
{
int d = x1.rows();
int n1 = x1.cols();
int n2 = x2.cols();
MatrixXd out(d, n1*n2);
for (int c = 0, idx = 0; c != n2; c+=1, idx += n1)
out.block(0, idx, d, n1) = x1.colwise() - x2.col(c);
return out;
}
template <typename Derived1, typename Derived2>
MatrixXd MFdiff3(
DenseBase<Derived1> const & x1,
DenseBase<Derived2> const & x2)
{
int d = x1.rows();
int n1 = x1.cols();
int n2 = x2.cols();
MatrixXd out;
out = - x2.replicate(n1, 1);
out.resize(d, n1*n2);
out += x1.replicate(1, n2);
return out;
}
template <typename Derived1, typename Derived2>
MatrixXd MFdiff5(
DenseBase<Derived1> const & x1,
DenseBase<Derived2> const & x2)
{
int n1 = x1.cols();
int n2 = x2.cols();
return x1.rowwise().replicate(n2) - x2(all,VectorXi::LinSpaced(n1*n2,0,n2-1));
}
double test(VectorXi const & ds,
VectorXi const & n1s,
VectorXi const & n2s)
{
MatrixXd z1 = MatrixXd::Random(ds.maxCoeff(), n1s.maxCoeff());
MatrixXd z2 = MatrixXd::Random(ds.maxCoeff(), n2s.maxCoeff());
double s = 0;
for (int idx = 0; idx!=ds.rows(); idx++) {
for (int jdx = 0; jdx!=n1s.rows(); jdx++) {
for (int kdx = 0; kdx!=n2s.rows(); kdx++) {
MatrixXd K = MFdiff5(z1.block(0, 0, ds(idx), n1s(jdx)),
z2.block(0, 0, ds(idx), n2s(kdx)));
s += K(0,0);
}
}
}
return s;
}
int main() {
VectorXi ds = VectorXi::LinSpaced(10, 1, 10);
VectorXi n1s = VectorXi::LinSpaced(100, 5, 500);
VectorXi n2s = VectorXi::LinSpaced(100, 5, 500);
std::cout << test(ds, n1s, n2s) << '\n';
}
With the head of Eigen you can write:
M = x1.rowwise().replicate(n2) - x2(Eigen::all,VectorXi::LinSpaced(n1*n2,0,n2-1));
with same speed as your variant 2.
Self-contained benchmark (BenchTimer.h needs a clone of the repo), tested with -O3 -DNDEBUG with gcc 7 and clang 6:
#include <iostream>
#include <Eigen/Dense>
#include <bench/BenchTimer.h>
using namespace Eigen;
using namespace std;
EIGEN_DONT_INLINE
void foo1(const MatrixXd& x1, const MatrixXd& x2, MatrixXd& M)
{
int d = x1.rows();
int n1 = x1.cols();
int n2 = x2.cols();
int idx = 0;
for (int c = 0; c != n2; c++) {
M.block(0, idx, d, n1) = x1.colwise() - x2.col(c);
idx += n1;
}
}
EIGEN_DONT_INLINE
void foo2(const MatrixXd& x1, const MatrixXd& x2, MatrixXd& M)
{
int n1 = x1.cols();
int n2 = x2.cols();
M = x1.rowwise().replicate(n2) - x2(all,VectorXi::LinSpaced(n1*n2,0,n2-1));
}
int main()
{
int tries = 2;
int rep = 1;
int d = 100;
int n1 = 100;
int n2 = 100;
MatrixXd x1(d,n1); x1.setRandom();
MatrixXd x2(d,n2); x2.setRandom();
MatrixXd M(d, n1*n2);
BenchTimer t;
BENCH(t, tries, rep, foo1(x1, x2, M));
std::cout << "Time: " << t.best() << "s" << std::endl;
BENCH(t, tries, rep, foo2(x1, x2, M));
std::cout << "Time: " << t.best() << "s" << std::endl;
}
Its been two days and I am still cant figure it out why my implementation of CUDA matrix multiplication differs from the results produced in MATLAB.
CUDA kernel: A(200x60000) = W(200x784) * Data(784x6000)
__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if ((Row < W.row) && (Col < Data.col)){
float Cvalue = 0.0;
for (int i = 0; i < W.col; ++i){
Cvalue += W.elements[Row*W.col+i] * Data.elements[i*Data.col+Col];
}
A.elements[Row*A.col+Col] = Cvalue;
}
}
And calling the kernel:
void myFunc(Matrix W1, Matrix data){
Matrix d_W1, d_data, d_a2, a2;
size_t size;
a2.row = W1.row; d_a2.row = a2.row;
a2.col = data.col; d_a2.col = a2.col;
size = a2.col*a2.row*sizeof(float);
cudaMalloc(&d_a2.elements,size);
d_W1.row = W1.row; d_W1.col = W1.col;
size = W1.col*W1.row*sizeof(float);
cudaMalloc(&d_W1.elements,size);
cudaMemcpy(d_W1.elements,W1.elements,size,cudaMemcpyHostToDevice);
d_data.col = data.col; d_data.row = data.row;
size = data.row*data.col*sizeof(float);
cudaMalloc(&d_data.elements,size);
cudaMemcpy(d_data.elements,data.elements,size,cudaMemcpyHostToDevice);
dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
dim3 dimBlock(32, 32, 1);
CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
a2.elements = new float [a2.row*a2.col];
cudaMemcpy(a2.elements,d_a2.elements,sizeof(float)*a2.row*a2.col,cudaMemcpyDeviceToHost);
printf("\nA2 first and last member %f - %f\n",a2.elements[0],a2.elements[a2.row*a2.col-1]);
}
Results difference is not low for example first and last elements of CUDA code is 0.011322 and -0.179534 but multiplying in MATLAB results in 0.4280 and 0.0056.
this is how I do it in MATLAB:
>> size(W1) ans = 200 784
>> size(data) ans = 784 60000
>> z2=W1*data;
>> size(z2) ans = 200 60000
>> z2 = z2(:);
>> z2(1) ans = 0.4280
>> z2(200*60000)ans = 0.0056
There is nothing wrong with the code you posted. If I expand your kernel and function into a complete running example like this:
#include <iostream>
struct Matrix
{
int row;
int col;
float *elements;
__device__ __host__
float& operator()(int r, int c) { return elements[r*col + c]; };
};
__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if ((Row < W.row) && (Col < Data.col)){
float Cvalue = 0.0;
for (int i = 0; i < W.col; ++i){
Cvalue += W.elements[Row*W.col+i] * Data.elements[i*Data.col+Col];
}
A.elements[Row*A.col+Col] = Cvalue;
}
}
void myFunc(Matrix W1, Matrix data)
{
Matrix d_W1, d_data, d_a2, a2;
size_t size;
a2.row = W1.row; d_a2.row = a2.row;
a2.col = data.col; d_a2.col = a2.col;
size = a2.col*a2.row*sizeof(float);
cudaMalloc(&d_a2.elements,size);
d_W1.row = W1.row; d_W1.col = W1.col;
size = W1.col*W1.row*sizeof(float);
cudaMalloc(&d_W1.elements,size);
cudaMemcpy(d_W1.elements,W1.elements,size,cudaMemcpyHostToDevice);
d_data.col = data.col; d_data.row = data.row;
size = data.row*data.col*sizeof(float);
cudaMalloc(&d_data.elements,size);
cudaMemcpy(d_data.elements,data.elements,size,cudaMemcpyHostToDevice);
dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
dim3 dimBlock(32, 32, 1);
CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
a2.elements = new float [a2.row*a2.col];
cudaMemcpy(a2.elements,d_a2.elements,sizeof(float)*a2.row*a2.col,cudaMemcpyDeviceToHost);
for(int j=0; j<a2.col; ++j) {
for(int i=0; i<a2.row; ++i) {
std::cout << a2(i,j) << " ";
}
std::cout << std::endl;
}
}
int main(void)
{
float a[6] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f };
float b[6] = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};
Matrix W1; W1.row=2; W1.col=3; W1.elements = &a[0];
Matrix Data; Data.row=3; Data.col=2; Data.elements = &b[0];
myFunc(W1, Data);
return 0;
}
and run it, I get this:
>nvcc -arch=sm_21 -Xptxas="-v" -m32 matrix.cu
matrix.cu
tmpxft_000014f4_00000000-5_matrix.cudafe1.gpu
tmpxft_000014f4_00000000-10_matrix.cudafe2.gpu
matrix.cu
ptxas : info : 132 bytes gmem, 28 bytes cmem[14]
ptxas : info : Compiling entry function '_Z10CalculateA6MatrixS_S_' for 'sm_21'
ptxas : info : Function properties for _Z10CalculateA6MatrixS_S_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 14 registers, 68 bytes cmem[0]
tmpxft_000014f4_00000000-5_matrix.cudafe1.cpp
tmpxft_000014f4_00000000-15_matrix.ii
>cuda-memcheck a.exe
========= CUDA-MEMCHECK
2.2 4.9
2.8 6.4
========= ERROR SUMMARY: 0 errors
which is the correct answer for the dot product assuming column major ordering (which is the Matlab convention).
So if your results don't agree, it is because of something you haven't shown us. One likelihood is that your test problem is so large (and kernel so inefficient) that if you are running this on a display GPU, your program is hitting the display driver watchdog timer limit and being killed before the kernel finishes running. Also note that you have no CUDA API error checking whatsoever, so it is possible that you are getting runtime errors which is either stopping your kernel from finishing or even running at all, but you simply don't notice because of the lack of error checking.