8051 - I2C BitBang - i2c

Im using I2C bitbang and Im facing an issue. I think I need a delay between SCL and SDA lines. Im not sure how much the delay needs to be.
Can someone check my I2C code to see if everything checks out? I want to make sure its not a software mistake.
#include <at89c51ic2.h>
#include <stdio.h>
#define SDA P0_0
#define SCL P0_1
void I2CInit(){
SDA = 1;
SCL = 1;
}
void I2CStart(){
SCL = 1;
I2CDelay();
SDA = 0;
I2CDelay();
SCL = 0;
I2CDelay();
}
void I2CRestart(){
SDA = 1;
SCL = 1;
SDA = 0;
SCL = 0;
}
void I2CStop(){
SDA = 0;
I2CDelay();
SCL = 1;
I2CDelay();
SDA = 1;
I2CDelay();
}
void I2CAck(){
I2CDelay();
SDA = 0;
I2CDelay();
SCL = 1;
I2CDelay();
SCL = 0;
I2CDelay();
SDA = 1;
I2CDelay();
}
void I2CNak(){
SDA = 1;
SCL = 1;
SCL = 0;
SDA = 1;
}
void I2CSend(unsigned char Data){
unsigned char i;
for (i = 0; i < 8; i++) {
if ((Data & 0x80) == 0)
SDA = 0;
else
SDA = 1;
SCL = 1;
I2CDelay();
SCL = 0;
I2CDelay();
Data <<= 1;
}
SDA = 1;
I2CDelay();
SCL = 1;
I2CDelay();
if (SDA)
SCL = 0;
I2CDelay();
SDA = 1;
SCL = 1;
}
char I2CRead(unsigned char lastone){ // last one == 1 for last byte; 0 for any other byte
char i, Data=0;
for (i = 0; i < 8; i++){
SCL = 1;
I2CDelay();
if(SDA) Data |=1;
if(i<7) Data<<=1;
SCL = 0;
I2CDelay();
}
SDA = lastone;
SCL = 1;
I2CDelay();
SCL = 0;
SDA = 1;
SCL = 1;
I2CDelay();
return Data;
}
void I2CSendAddr(unsigned char addr, unsigned char rd){
I2CStart();
I2CSend(addr+rd);
}
void I2CDelay(){
unsigned int i;
for (i = 0; i < 4000; i++);
return;
}

Related

SystemVerilog: $urandom_range gives values outside of range

I am getting an odd issue in ModelSim where I set an input variable to a random value in a range, but for some reason, I get a value outside of the range. All my code is included below but the essential line is:
write_addrs[i] = $urandom_range(1,NUM_ARCH_REGS);
In ModelSim, it is being assigned to 0 when it shouldn't (as shown in the waveform; the highlighted signal)...
The thing that is confounding me is that I don't ever set the write_adresses to zero except for when I set the initial signals in the INITIAL_VECTOR_VALUES block. I only modify the variable by using the $urandom_range function with the range explicitly excluding the number zero.
The main blocks of code in which I write to this variable are here:
initial begin : INITIAL_VECTOR_VALUES
advance = 0;
checkpoint = 0;
recover = 0;
write_before_checkpoint = 0;
for (int i = 0; i < NUM_READ_PORTS; i++)
read_addrs[i] = 0;
for (int i = 0; i < NUM_WRITE_PORTS; i++) begin
write_addrs[i] = 0; //<--------------------- HERE!!!
wr_en[i] = 0;
write_data[i] = 0;
commit_en[i] = 0;
commit_data[i] = 0;
commit_addrs[i]= 0;
end
end
... and here:
task random_operations(int repeat_num);
//local vars
int operation_select, num_write, num_commit;
int current_checkpoints;
int loop_idx;
##5;
current_checkpoints = 0; //initialize
$display("Begin Random Operations # %0t", $time());
while (loop_idx < repeat_num) begin
... other stuff ...
//operand select (sets the stimulus inputs)
for (int k = 0; k < num_write; k++) begin
write_addrs[k] = $urandom_range(1,NUM_ARCH_REGS); //<--------------------- HERE!!!
$display("%0t: num_write = %0d",$time(), num_write);
$display("%0t: write_addrs[%0d] = %0d", $time(), k, write_addrs[k]);
write_data[k] = $urandom_range(1,128);
wr_en[k] = 1;
end
...
loop_idx++;
##1;
//reset signals (reset stimulus inputs)
...
end : end_while
endtask : random_operations
Does anyone know why this would be happening?
REFERENCE CODE
`timescale 1ns/1ns
module testbench;
localparam int NUM_CHECKPOINTS = 8;
localparam int NUM_ARCH_REGS = 32;
localparam int NUM_READ_PORTS = 4;
localparam int NUM_WRITE_PORTS = 2;
logic clk;
logic rst;
initial begin : CLOCK_INIT
clk = 1'b0;
forever #5 clk = ~clk;
end
default clocking tb_clk #(posedge clk); endclocking
logic [$clog2(32)-1:0] read_addrs [4];
logic [$clog2(32)-1:0] write_addrs [2];
logic wr_en [2];
logic advance; //moves tail pointer forward
logic checkpoint; //moves head pointer forward
logic recover; //moves head pointer backward (to tail)
logic write_before_checkpoint;
logic [$clog2(32)-1:0] commit_addrs [2];
logic [$clog2(128)-1:0] commit_data [2];
logic commit_en [2];
logic [$clog2(128)-1:0] write_data [2];
logic [$clog2(128)-1:0] read_data [4];
logic [$clog2(128)-1:0] write_evict_data [2];
logic enable_assertions;
cfc_rat dut(.*);
shadow_rat rat_monitor(.*);
initial begin : INITIAL_VECTOR_VALUES
advance = 0;
checkpoint = 0;
recover = 0;
write_before_checkpoint = 0;
for (int i = 0; i < NUM_READ_PORTS; i++)
read_addrs[i] = 0;
for (int i = 0; i < NUM_WRITE_PORTS; i++) begin
write_addrs[i] = 0;
wr_en[i] = 0;
write_data[i] = 0;
commit_en[i] = 0;
commit_data[i] = 0;
commit_addrs[i]= 0;
end
end
task reset();
rst = 1;
##2;
rst = 0;
##1;
endtask : reset
task random_operations(int repeat_num);
//local vars
int operation_select, num_write, num_commit;
int current_checkpoints;
int loop_idx;
##5;
current_checkpoints = 0; //initialize
$display("Begin Random Operations # %0t", $time());
while (loop_idx < repeat_num) begin
operation_select = (loop_idx < 5) ? 1 : ((dut.dfa.chkpt_empty) ? $urandom_range(0,1) : ((dut.dfa.chkpt_full) ? 1 : $urandom_range(0,2)));
num_write = $urandom_range(0,NUM_WRITE_PORTS);
num_commit = $urandom_range(0,NUM_WRITE_PORTS);
case (operation_select)
0: begin //checkpoint
if (current_checkpoints+1 < NUM_CHECKPOINTS) begin
$display("Checkpoint # %0t", $time());
write_before_checkpoint = $urandom_range(0,1);
checkpoint = 1;
current_checkpoints++;
end
else begin
loop_idx--;
continue;
end
end
1: $display("Normal RW # %0t",$time()); //no operation, only read and write
2: begin //advance
if (current_checkpoints > 0) begin
advance = 1;
$display("Advance # %0t", $time());
current_checkpoints--;
end
else begin
loop_idx--;
continue;
end
end
3: begin //recover
$display("Recover # %0t", $time());
recover = 1;
current_checkpoints = 0;
end
default:;
endcase // operation_select
//operand select (sets the stimulus inputs)
for (int k = 0; k < NUM_READ_PORTS; k++)
read_addrs[k] = $urandom_range(0,NUM_ARCH_REGS);
for (int k = 0; k < num_write; k++) begin
write_addrs[k] = $urandom_range(1,NUM_ARCH_REGS);
$display("%0t: num_write = %0d",$time(), num_write);
$display("%0t: write_addrs[%0d] = %0d", $time(), k, write_addrs[k]);
write_data[k] = $urandom_range(1,128);
wr_en[k] = 1;
end
for (int k = 0; k < num_commit; k++) begin
commit_addrs[k] = $urandom_range(1,NUM_ARCH_REGS);
commit_data[k] = $urandom_range(1,128);
commit_en[k] = 1;
end
loop_idx++;
##1;
//reset signals (reset stimulus inputs)
checkpoint = 0;
recover = 0;
advance = 0;
write_before_checkpoint = 0;
for (int i = 0; i < NUM_WRITE_PORTS; i++) begin
write_data[i] = 0;
wr_en[i] = 0;
end
for (int i = 0; i < NUM_READ_PORTS; i++)
read_addrs[i] = 0;
for (int i = 0; i < NUM_WRITE_PORTS; i++) begin
commit_en[i] = 0;
commit_data[i] = 0;
end
end : end_repeat
endtask : random_operations
initial begin : TEST_VECTORS
enable_assertions = 1; //for testing the monitor
reset();
random_operations(5000);
##10;
$display("Finished Successfuly! # %0t",$time());
$finish;
end
endmodule
The problem is that $urandom_range(1, NUM_ARCH_REGS) returns values from 1 to 32. But, write_addrs is declared as logic [4:0], which means it can only take on values from 0 to 31. When $urandom_range returns 32 (which is the same as 6'b10_000), the assignment in your code truncates it to 5 bits, dropping the MSB, and 5'b0_0000 is stored in write_addrs.
To fix this, only allow random values up to 31.
Change:
write_addrs[k] = $urandom_range(1, NUM_ARCH_REGS);
to:
write_addrs[k] = $urandom_range(1, NUM_ARCH_REGS-1);
Here is a complete example to demonstrate the problem:
module tb;
localparam int NUM_ARCH_REGS = 32;
logic [$clog2(32)-1:0] write_addrs [2];
initial begin
repeat (100) begin
for (int k=0; k<2; k++) begin
write_addrs[k] = $urandom_range(1, NUM_ARCH_REGS);
$write("write_addrs[%0d]=%02d ", k, write_addrs[k]);
end
$display;
end
end
endmodule
The problem you see is not specific to ModelSim; I am able to see it on 2 other simulators.

free() and mxFree() in MATLAB - freeing memory twice

Good day,
I have the following code which has given me problems for a day already.
I have debugged it, and it works fine until trying to free the memory. The free() function should be called at the end of the execution automatically, so I commented the mxFree() code out, in hope of getting a result. Even if I do that, the program frees the memory twice, like in the case of manually freeing memory - and thus I conclude that it is beyond my control.
*** glibc detected *** /usr/local/MATLAB/R2012a/bin/glnx86/MATLAB: free(): invalid pointer: 0xad2427a1 ***
Is there something I missed?
Note: I have tried some examples of .mex files with memory allocation and they work fine - so the mistake is down below, in my code.
/*
Beamforming algorithm
Arguments: xr, yr, zr,
t, ts, W, U,
Sdata, NrSensor c, omega_o
Output: S1_sum
S1_sum = beamforming(xr,yr,zr,t,ts,W,U,Sdata,NrSensor,c,omega_o)
*/
#include "mex.h"
#include <stdlib.h>
#include <math.h>
#define INPUT_ARGS 11
#define OUTPUT_ARGS 1
#define Ar(a, b) Ar[b+a*NrSensor]
#define Ai(a, b) Ai[b+a*NrSensor]
#define V(a, b) V[b+a*len_zr]
#define tau(a, b) tau[b+a*NrSensor]
#define tau_s(a, b) tau_s[b+a*len_zr]
#define tau_r(a, b) tau_r[b+a*NrSensor]
#define w(a, b) w[b+a*len_zr]
#define W(a, b) W[b+a*NrSensor]
#define arg(a, b) arg[b+a*len_zr]
#define newrange(a, b) newrange[b+a*len_zr]
#define oldrange(a, b) oldrange[b+a*len_zr]
#define S1_sum_r(a, b, c) S1_sum_r[c+b*len_xr+a*len_yr*len_xr]
#define S1_sum_i(a, b, c) S1_sum_i[c+b*len_xr+a*len_yr*len_xr]
#define Sdata_r(a, b) Sdata_r[b+a*NrSensor]
#define Sdata_i(a, b) Sdata_i[b+a*NrSensor]
#define S1interpr(a, b) S1interpr[b+a*len_zr]
#define S1interpi(a, b) S1interpi[b+a*len_zr]
#define PI_ 3.141592653
double sinc(double x){
return sin(PI_*x)/(PI_*x);
}
void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
{
/*
Declarations
*/
int xr_, yr_, i, j, m;
int len_xr, len_yr, len_zr, len_t,
len_t_, NrSensor, NrSensor_Cen;
double _t, _i, arg_min, arg_max, norm_factor;
double start_tau, end_tau, c, omega_o;
double *xr, *yr, *zr, *t, *ts,
*Sdata_r, *Sdata_i, *W;
double *Ar, *Ai, *V, *tau, *tau_r, *tau_s,
*arg, *U, *w, *t_, *x_r, *x_i, *start_index,
*newrange, *oldrange, *S1_sum_r, *S1_sum_i,
*S1interpi, *S1interpr;
/*
Checking number of arguments
*/
if(nrhs != INPUT_ARGS)
mxErrMsgTxt("Incorrect number of arguments!\n");
if(nlhs != OUTPUT_ARGS)
mxErrMsgTxt("Incorrect number of outputs!\n");
/*
Reading arguments
*/
xr = mxGetPr(prhs[0]);
yr = mxGetPr(prhs[1]);
zr = mxGetPr(prhs[2]);
t = mxGetPr(prhs[3]);
ts = mxGetPr(prhs[4]);
W = mxGetPr(prhs[5]);
U = mxGetPr(prhs[6]);
Sdata_r = mxGetPr(prhs[7]);
Sdata_i = mxGetPi(prhs[7]);
NrSensor = (int) mxGetScalar(prhs[8]);
c = mxGetScalar(prhs[9]);
omega_o = mxGetScalar(prhs[10]);
len_xr = mxGetN(prhs[0]);
len_yr = mxGetN(prhs[1]);
len_zr = mxGetN(prhs[2]);
len_t = mxGetM(prhs[3]);
/*
Initialisations
*/
_t = 0.0;
len_t_ = 0;
NrSensor_Cen = NrSensor/2;
/*
Space allocation and checking
*/
arg = malloc(sizeof(double)*len_zr*len_t);
Ar = malloc(sizeof(double)*len_zr*NrSensor);
Ai = malloc(sizeof(double)*len_zr*NrSensor);
V = malloc(sizeof(double)*len_zr*3);
tau = malloc(sizeof(double)*len_zr*NrSensor);
tau_s = malloc(sizeof(double)*len_zr*3);
tau_r = malloc(sizeof(double)*len_zr*NrSensor);
U = malloc(sizeof(double)*3);
w = malloc(sizeof(double)*len_zr*3);
W = malloc(sizeof(double)*NrSensor*3);
ts = malloc(sizeof(double)*len_zr);
t = malloc(sizeof(double)*len_t);
t_ = malloc(sizeof(double)*len_t);
x_r = malloc(sizeof(double)*len_t);
x_i = malloc(sizeof(double)*len_t);
arg = malloc(sizeof(double)*len_zr*len_t);
newrange = malloc(sizeof(double)*len_zr*len_t);
oldrange = malloc(sizeof(double)*len_zr*len_t);
S1interpr = malloc(sizeof(double)*NrSensor*len_zr);
S1interpi = malloc(sizeof(double)*NrSensor*len_zr);
start_index = malloc(sizeof(double)*len_t);
/*
S1_sum_r = mxMalloc(len_xr*len_yr*len_zr);
S1_sum_i = mxMalloc(len_xr*len_yr*len_zr);
*/
int dim_S1_sum[3] = {len_xr, len_yr, len_zr};
plhs[0] = mxCreateNumericArray(3, dim_S1_sum,
mxDOUBLE_CLASS,
mxCOMPLEX);
S1_sum_r = (double*) mxGetPr(plhs[0]);
S1_sum_i = (double*) mxGetPi(plhs[0]);
if(arg == NULL ||
Ar == NULL ||
Ai == NULL ||
V == NULL ||
tau == NULL ||
tau_s == NULL ||
tau_r == NULL ||
U == NULL ||
W == NULL ||
w == NULL ||
ts == NULL ||
t == NULL ||
t_ == NULL ||
x_r == NULL ||
x_i == NULL ||
start_index == NULL ||
newrange == NULL ||
oldrange == NULL ||
S1_sum_r == NULL ||
S1_sum_i == NULL ||
S1interpr == NULL||
S1interpi == NULL){
mxErrMsgTxt("Malloc error!\n");
return;
}
/*
--- INITIALISING S1interp, S1_sum, tau, full of zeros
*/
for(i=0; i<NrSensor; i++){
for(j=0; j<len_zr; j++){
S1interpr(i,j) = 0;
S1interpi(i,j) = 0;
}
}
for(i=0; i<len_xr; i++){
for(j=0; j<len_yr; j++){
for(m=0; m<len_zr; m++){
S1_sum_r(i,j,m) = 0;
S1_sum_i(i,j,m) = 0;
}
}
}
for(i=0; i<NrSensor; i++){
for(j=0; j<len_zr; j++){
tau(i,j) = 0;
}
}
/*
--- MAIN ALGORITHM ---
*/
for(xr_=0; xr_ < len_xr; xr_++){
for(yr_=0; yr_ < len_yr; yr_++){
for(i=0; i < len_zr; i++){
V(0, i) = xr[xr_];
V(1, i) = yr[yr_];
V(2, i) = zr[i];
}
for(i=0; i < len_zr; i++){
tau_s(0, i) = V(0, i) - U[0];
tau_s(1, i) = V(1, i) - U[1];
tau_s(2, i) = V(2, i) - U[2];
}
for(m=0; m < NrSensor; m++){
for(i=0; i < len_zr; i++){
/*
I see no point of the squeeze function
since W is already of known sizes
*/
w(0, i) = V(0, i) - W(0, m);
w(1, i) = V(1, i) - W(1, m);
w(2, i) = V(2, i) - W(2, m);
}
for(i=0; i< len_zr; i++){
/*
sum(w.*w)
*/
_t = w(0, i)*w(0,i) +
w(1, i)*w(1,i) +
w(2, i)*w(2,i);
tau_r(m, i) = sqrt(_t)/c;
}
}
for(m=0; m < len_zr; m++){
for(i=0; i < NrSensor; i++){
/*
Computing sum(tau_s(m, :).*tau_s(m, :))
*/
_t = tau_s(0, m)*tau_s(0, m) +
tau_s(1, m)*tau_s(1, m) +
tau_s(2, m)*tau_s(2, m);
tau(i, m) = tau_r(i, m) + sqrt(_t)/c;
}
}
/*
for(i=0; i < len_zr; i++){
for(j=0; j < NrSensor; j++){
Tau(i,j)=tau(i,j);
}
}
*/
for(i=0; i < len_zr; i++){
for(j=0; j < NrSensor; j++){
Ar(i,j)=cos(omega_o * tau(i, j));
Ai(i,j)=sin(omega_o * tau(i, j));
}
}
/*
--- BIG LOOP AHEAD ---
*/
for(m=0; m < NrSensor; m++){
start_tau = tau(NrSensor_Cen, 1);
end_tau = tau(NrSensor_Cen, len_zr);
/*
Finding index : writing start indexes
and also t_ array
*/
len_t_=0;
for(i=0; i<len_t; i++){
if(t[i] >= start_tau &&
t[i] <= end_tau){
start_index[len_t_] = i;
t_[len_t_] = t[i];
x_r[len_t_] = Sdata_r(i, m);
x_i[len_t_] = Sdata_i(i, m);
len_t_++;
}
}
for(i=0; i < len_zr; i++){
ts[i]=tau(m, i);
}
for(i=0; i < len_t_; i++){
for(j=0; j < len_zr; j++){
newrange(i, j) = ts[j];
}
}
for(i=0; i < len_zr; i++){
for(j=0; j < len_t_; j++){
oldrange(j, i) = t_[j];
}
}
for(i=0; i < len_t_; i++){
for(j=0; j < len_zr; j++){
arg(i, j) = newrange(i, j)-oldrange(i, j);
}
}
arg_min = arg[0];
for(i=0; i < len_t_; i++)
for(j=0; j < len_zr; j++)
if(arg_min>arg(i, j))
arg_min=arg(i, j);
arg_max = arg[0];
for(i=0; i < len_t_; i++)
for(j=0; j < len_zr; j++)
if(arg_max<arg(i, j))
arg_max=arg(i, j);
norm_factor = (2*len_t_)/(arg_max-arg_min);
for(i=0; i < len_zr; i++){
_t = 0;
for(j=0; j < NrSensor; j++){
_t = sinc(arg(i, j)*norm_factor)*x_r[j];
}
S1interpr(m, i) = Ar(m, i) * _t;
_t = 0;
for(j=0; j < NrSensor; j++){
_t = sinc(arg(i, j)*norm_factor)*x_i[j];
}
S1interpi(m, i) = Ai(m, i) * _t;
}
}
for(i=0; i < len_zr; i++){
_t = 0;
for(j=0; j < NrSensor; j++){
_t += S1interpr(j, i);
}
S1_sum_r(xr_, yr_, i) = _t;
_t = 0;
for(j=0; j < NrSensor; j++){
_t += S1interpi(j, i);
}
S1_sum_i(xr_, yr_, i) = _t;
}
}
}
free(arg);
free(Ar);
free(Ai);
free(V);
free(tau);
free(tau_s);
free(tau_r);
free(U);
free(w);
free(W);
free(ts);
free(t);
free(t_);
free(x_r);
free(x_i);
free(newrange);
free(oldrange);
free(S1interpr);
free(S1interpi);
free(start_index);
return;
}
EDIT
I have deleted all my code except the beginning (up to the memory allocation) and the memory freeing. I'm also using malloc() and free() now. In between the memory allocation and freeing, I have also put this code:
for(i=0; i<NrSensor; i++){
for(j=0; j<len_zr; j++){
S1interpr(i,j) = 0;
S1interpi(i,j) = 0;
}
}
for(i=0; i<NrSensor; i++){
for(j=0; j<len_zr; j++){
tau(i,j) = 0;
}
}
The first loop causes no problem. The second one though, apparently corrupts the variable V (declared before it) and the one declared after it. And it seems, according to my logic, that it does not exceed any kind of bounds...
The second loop does not index correctly for tau. You are defining indexing for tau as
#define tau(a, b) tau[b+a*NrSensor]
Let us walk through the second loop assuming NrSensor = 10 and len_zr = 5. For this case max value of loop variable i is 9 and max value of loop variable j is 4. Now,
tau(9,4) => tau[4+9*10] => tau[94].
But you are allocating tau with
tau = malloc(sizeof(double)*len_zr*NrSensor);
which for the sample values 10 and 5 is
tau = malloc(sizeof(double)*50)
You either need to change the indexing definition for tau to swap a and b or change the order of loops i and j.

Renderscript hangs device

I am implementing part of FFT algorithm using Renderscript in Android, When I ran the code my application got hanged. I want to process 512 values from real and img allocation at a time. Kernel will execute 512 times using provided dummy allocation of size 512.
Here is my java code
RenderScript rs = RenderScript.create(WajinViewerApplication
.getApplication());
ScriptC_fft scriptC_fft = new ScriptC_fft(rs);
float inReal[] = new float[512 * 512];
float inImg[] = new float[512 * 512];
int k = 0;
for (int i = 0; i < 512; i++) {
for (int j = 0; j < 512; j++) {
// copy values from complex 2d array to 1d array
inReal[k] = data[i][j].real;
inImg[k] = data[i][j].imaginary;
k++;
}
}
Allocation realAllocation = Allocation.createSized(rs, Element.F32(rs),
512 * 512);
Allocation imgAllocation = Allocation.createSized(rs, Element.F32(rs),
512 * 512);
realAllocation.copyFrom(inReal);
imgAllocation.copyFrom(inImg);
scriptC_fft.set_real(realAllocation);
scriptC_fft.set_img(imgAllocation);
Allocation inAllocation = Allocation.createSized(rs, Element.U16(rs),
512);
Allocation outAllocation = Allocation.createTyped(rs,
inAllocation.getType());
inAllocation.copyFrom(new short[512]);
// set direction
if (direction == Direction.Forward) {
scriptC_fft.set_is_forward(true);
} else {
scriptC_fft.set_is_forward(false);
}
scriptC_fft.set_len(512);
scriptC_fft.set_levels(Integer.numberOfLeadingZeros(512));
scriptC_fft.forEach_root(inAllocation, outAllocation);
outAllocation.copyTo(new short[512]);
float outReal[] = new float[512 * 512];
float outImg[] = new float[512 * 512];
scriptC_fft.get_real().copyTo(outReal);
scriptC_fft.get_img().copyTo(outImg);
k = 0;
for (int i = 0; i < 512; i++) {
for (int j = 0; j < 512; j++) {
// copy values from complex 1d array to 2d array
data[i][j].real = outReal[k];
data[i][j].imaginary = outImg[k];
k++;
}
}
rs.destroy();
And here is my Renderscript code
#pragma version(1)
#pragma rs java_package_name(jp.drmh.wajin.newversion)
#include "common.rsh"
rs_allocation real;
rs_allocation img;
bool is_forward;
uint32_t len;
uint32_t levels;
uint16_t __attribute__((kernel)) root(uint16_t in, uint32_t x, uint32_t y){
// rsDebug("call",x);
float realval[512];
float imagval[512];
if(is_forward){
for (uint32_t i = 0; i < len; i++) {
realval[i]=rsGetElementAt_float(real,x*512+i);
imagval[i]=rsGetElementAt_float(img,x*512+i);
//rsDebug("values", realval[i], imagval[i]);
}
}else{
for (uint32_t i = 0; i < len; i++) {
realval[i]=rsGetElementAt_float(img,x*512+i);
imagval[i]=rsGetElementAt_float(real,x*512+i);
}
}
float costable[256],sintable[256];
for (uint32_t i = 0; i < len / 2; i++) {
costable[i]=cos(2 * M_PI * i / len);
sintable[i]=sin(2 * M_PI * i / len);
}
// Bit-reversed addressing permutation
for (uint32_t i = 0; i < len; i++) {
uint32_t j = bit_reverse32(i);
uint32_t ans=j>>(32 - levels);
if (j > i) {
float temp = realval[i];
realval[i] = realval[j];
realval[j] = temp;
temp = imagval[i];
imagval[i] = imagval[j];
imagval[j] = temp;
}
}
for (uint32_t size = 2; size <= len; size *= 2) {
uint32_t halfsize = size / 2;
uint32_t tablestep = len / size;
for (uint32_t i = 0; i < len; i += size) {
for (uint32_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
float tpre=realval[j + halfsize] * costable[k]
+ imagval[j + halfsize] * sintable[k];
float tpim = -realval[j + halfsize] * sintable[k]
+ imagval[j + halfsize] * costable[k];
realval[j + halfsize] = realval[j] - tpre;
imagval[j + halfsize] = imagval[j] - tpim;
realval[j] += tpre;
imagval[j] += tpim;
}
}
if (size == len)
break;
}
if(!is_forward){
for(uint32_t i = 0; i < len; i++){
realval[i]=realval[i]/len;
imagval[i]=imagval[i]/len;
rsDebug("values", realval[i], imagval[i]);
}
for (uint32_t i = 0; i < len; i++) {
rsSetElementAt_float(real, realval[i], x*512+i);
rsSetElementAt_float(img, imagval[i], x*512+i);
}
}
return in;
}

Where is the huge performance difference for the two versions of the code?

I am working on a problem that Given a string s, partitions s such that every substring of the partition is a palindrome.
Return the minimum cuts needed for a palindrome partitioning of s. The problem can also be found in here. https://oj.leetcode.com/problems/palindrome-partitioning-ii/
Version 1 is one version of solution I found online.
Version 2 is my code.
They both seem to work in very similar ways. However, with a reasonably large input, version 2 takes more than 6000 milliseconds whereas version 1 takes around 71 milliseconds.
Can anyone provide any idea where the time difference is from?
Version 1:
int minSol(string s) {
int len = s.size();
vector<int> D(len + 1);
vector<vector<int>> P;
for (int i = 0; i < len; i++){
vector<int> t(len);
P.push_back(t);
}
for (int i = 0; i <= len; i++)
D[i] = len - i;
for (int i = 0; i < len; i++)
for (int j = 0; j < len; j++)
P[i][j] = false;
for (int i = len - 1; i >= 0; i--){
for (int j = i; j < len; j++){
if (s[i] == s[j] && (j - i < 2 || P[i + 1][j - 1])){
P[i][j] = true;
D[i] = min(D[i], D[j + 1] + 1);
}
}
}
return D[0] - 1;
}
Version 2:
int minCut(string s) {
int size = s.size();
vector<vector<bool>> map;
for (int i = 0; i < size; i++){
vector<bool> t;
for (int j = 0; j < size; j++){
t.push_back(false);
}
map.push_back(t);
}
vector<int> minCuts;
for (int i = 0; i < size; i++){
map[i][i] = true;
minCuts.push_back(size - i - 1);
}
for (int i = size - 1; i >= 0; i--){
for (int j = size - 1; j >= i; j--){
if (s[i] == s[j] && (j - i <= 1 || map[i + 1][j - 1])){
map[i][j] = true;
if (j == size - 1){
minCuts[i] = 0;
}else if (minCuts[i] > minCuts[j + 1] + 1){
minCuts[i] = minCuts[j + 1] + 1;
}
}
}
}
return minCuts[0];
}
I would guess it's because in the second version you're doing size^2 push_back's, whereas in the first version you're just doing size push_back's.

mex memory allocation mxRealloc

I have to read a matrix of double, handling its values and insert them in a new matrix, whose one of its dimension is unkwown at the beginning.
In a static memory allocation, my code is:
#include <mex.h>
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] )
{
const mxArray *I = prhs[0];
double *indata = mxGetPr(I);
double *submtx = mxGetPr(prhs[1]);
const int *size = mxGetDimensions(I);
int xp = (int)submtx[0], yp = (int)submtx[1], zp = (int)submtx[2];
int xi = size[0], yi = size[1], zi = size[2];
int numsubmtx = (xi - xp + 1)*(yi - yp + 1)*(zi - zp + 1);
int out_rows = xp*yp*zp;
int out_cols = numsubmtx;
mxArray *out = mxCreateDoubleMatrix( out_rows, out_cols, mxREAL );
double *outdata = mxGetPr(out);
int submtx_counter = 0;
for( int z_offset = 0; ...; z_offset++ ){
for( int y_offset = 0; ...; y_offset++ ){
for( int x_offset = 0; ...; x_offset++ ){
int row = 0;
for( int z_counter = 0; ...; z_counter++ ){
for( int y_counter = 0; ...; y_counter++ ){
for( int x_counter = 0; ...; x_counter++ ){
outdata[submtx_counter*out_rows + row] =
indata[ (x_offset+x_counter) + (y_offset+y_counter)*xi + (z_offset+z_counter)*xi*yi ];
++row;
}}}
++submtx_counter;
}}}
plhs[0] = out;
}
In a dynamic version, I do not know the value of out_cols, so I have to reallocate *out when a condition on the values of indata is satisfied.
My idea is something like this:
#include <mex.h>
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] )
{
const mxArray *I = prhs[0];
double *indata = mxGetPr(I);
double *submtx = mxGetPr(prhs[1]);
const int *size = mxGetDimensions(I);
int xp = (int)submtx[0], yp = (int)submtx[1], zp = (int)submtx[2];
int xi = size[0], yi = size[1], zi = size[2];
int numsubmtx = (xi - xp + 1)*(yi - yp + 1)*(zi - zp + 1);
int out_rows = xp*yp*zp;
//int out_cols = numsubmtx; NOW UNKNOWN!
mxArray *out = NULL;
double *outdata = mxGetPr(out);
int submtx_counter = 0;
for( int z_offset = 0; ...; z_offset++ ){
for( int y_offset = 0; ...; y_offset++ ){
for( int x_offset = 0; ...; x_offset++ ){
int row = 0;
double condition=0;
for( int z_counter = 0; ...; z_counter++ ){
for( int y_counter = 0; ...; y_counter++ ){
for( int x_counter = 0; ...; x_counter++ ){
condition += indata[ (x_offset+x_counter) + (y_offset+y_counter)*xi + (z_offset+z_counter)*xi*yi ]/(xp*yp*zp);
++row;
}}}
if(coundition>0.5){
out = mxRealloc(out, (submtx_counter+1)*out_rows*sizeof(double));
double *outdata = mxGetPr(out);
int row = 0;
for( int z_counter = 0; ...; z_counter++ ){
for( int y_counter = 0; ...; y_counter++ ){
for( int x_counter = 0; ...; x_counter++ ){
outdata[submtx_counter*out_rows + row] = indata[ (x_offset+x_counter) + (y_offset+y_counter)*xi + (z_offset+z_counter)*xi*yi ];
++row;
}}}
++submtx_counter;
}
}}}
plhs[0] = out;
}
What is wrong?
I haven't looked closely at the logic of your code, but I can see the following problems:
mxArray *out = NULL;
double *outdata = mxGetPr(out);
and
out = mxRealloc(out, (submtx_counter+1)*out_rows*sizeof(double));
will have strange behavior. An mxArray is a structure, and you want to reallocate memory for the data pointed to by the structure rather than the structure itself. Instead, try
double *outdata = mxMalloc(out_rows*sizeof(double));
and
outdata = mxRealloc(outdata, (submtx_counter+1)*out_rows*sizeof(double));
and then at the very end, create your output matrix:
mxArray *out = mxCreateDoubleMatrix(out_rows, submtx_counter, mxREAL);
mxSetPr(out, outdata);
plhs[0] = out;
This will guarantee that the metadata in out gives you a matrix of the correct size, etc.