Related
I am trying to execute some simple code from RAM, but for some reason the program halts or throws a hard fault. I am using CCMRAM for my data, heap and stack, and SRAM1 for executing code. Here are my linker script and startup file.
LinkerScript.ld
/*
******************************************************************************
**
** File : LinkerScript.ld
**
** Author : Auto-generated by Ac6 System Workbench
**
** Abstract : Linker script for STM32F407ZETx Device from STM32F4 series
** 112Kbytes SRAM1
** 16Kbytes SRAM2
** 64Kbytes CCMRAM
** 512Kbytes ROM
**
** Set heap size, stack size and stack location according
** to application requirements.
**
** Set memory bank area and size if external memory is used.
**
** Target : STMicroelectronics STM32
**
** Distribution: The file is distributed "as is," without any warranty
** of any kind.
**
*****************************************************************************
** #attention
**
** <h2><center>© COPYRIGHT(c) 2019 Ac6</center></h2>
**
** Redistribution and use in source and binary forms, with or without modification,
** are permitted provided that the following conditions are met:
** 1. Redistributions of source code must retain the above copyright notice,
** this list of conditions and the following disclaimer.
** 2. Redistributions in binary form must reproduce the above copyright notice,
** this list of conditions and the following disclaimer in the documentation
** and/or other materials provided with the distribution.
** 3. Neither the name of Ac6 nor the names of its contributors
** may be used to endorse or promote products derived from this software
** without specific prior written permission.
**
** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
** DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
** SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
** CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
** OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
** OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**
*****************************************************************************
*/
/* Entry Point */
ENTRY(Reset_Handler)
/* Highest address of the user mode stack.
   Computed from the CCMRAM region defined in MEMORY below, so the stack top
   follows the memory map if the region is ever changed
   (0x10000000 + 64K = 0x10010000). */
_estack = ORIGIN(CCMRAM) + LENGTH(CCMRAM); /* end of CCMRAM */
_Min_Heap_Size = 0; /* required amount of heap */
_Min_Stack_Size = 0x400; /* required amount of stack */
/* Memories definition.
   SRAM1/SRAM2 are marked executable (x); CCMRAM is read/write only and is
   used for data, bss, heap and stack. */
MEMORY
{
SRAM1 (xrw) : ORIGIN = 0x20000000, LENGTH = 112K
SRAM2 (xrw) : ORIGIN = 0x2001C000, LENGTH = 16K
FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 512K
CCMRAM (rw) : ORIGIN = 0x10000000, LENGTH = 64K
}
/* Sections */
SECTIONS
{
/* The startup code into ROM memory */
.isr_vector :
{
. = ALIGN(4);
KEEP(*(.isr_vector)) /* Startup code */
. = ALIGN(4);
} >FLASH
/* .sram1: code and data that run from SRAM1.
   The section is linked to run at an address in SRAM1 (VMA) but is stored in
   FLASH (LMA), as selected by "> SRAM1 AT >FLASH" below. */
/* Load address of .sram1 in FLASH */
_sisram1 = LOADADDR(.sram1);
.sram1 :
{
. = ALIGN(4);
_s_sram1 = .; /* run-time start address of .sram1 in SRAM1 */
*(.sram1);
*(.sram1*);
. = ALIGN(4);
_e_sram1 = .; /* run-time end address of .sram1 in SRAM1 */
} > SRAM1 AT >FLASH
/* .sram2: code and data that run from SRAM2.
   The section is linked to run at an address in SRAM2 (VMA) but is stored in
   FLASH (LMA), as selected by "> SRAM2 AT >FLASH" below. */
/* Load address of .sram2 in FLASH */
_sisram2 = LOADADDR(.sram2);
.sram2 :
{
. = ALIGN(4);
_s_sram2 = .; /* run-time start address of .sram2 in SRAM2 */
*(.sram2);
*(.sram2*);
. = ALIGN(4);
_e_sram2 = .; /* run-time end address of .sram2 in SRAM2 */
} > SRAM2 AT >FLASH
/* The program code and other data into FLASH memory */
.text :
{
. = ALIGN(4);
*(.text) /* .text sections (code) */
*(.text*) /* .text* sections (code) */
*(.glue_7) /* glue arm to thumb code */
*(.glue_7t) /* glue thumb to arm code */
*(.eh_frame)
KEEP (*(.init))
KEEP (*(.fini))
. = ALIGN(4);
_etext = .; /* define a global symbols at end of code */
} >FLASH
/* Constant data into FLASH memory*/
.rodata :
{
. = ALIGN(4);
*(.rodata) /* .rodata sections (constants, strings, etc.) */
*(.rodata*) /* .rodata* sections (constants, strings, etc.) */
. = ALIGN(4);
} >FLASH
.ARM.extab : {
. = ALIGN(4);
*(.ARM.extab* .gnu.linkonce.armextab.*)
. = ALIGN(4);
} > FLASH
.ARM : {
. = ALIGN(4);
__exidx_start = .;
*(.ARM.exidx*)
__exidx_end = .;
. = ALIGN(4);
} >FLASH
.preinit_array :
{
. = ALIGN(4);
PROVIDE_HIDDEN (__preinit_array_start = .);
KEEP (*(.preinit_array*))
PROVIDE_HIDDEN (__preinit_array_end = .);
. = ALIGN(4);
} >FLASH
.init_array :
{
. = ALIGN(4);
PROVIDE_HIDDEN (__init_array_start = .);
KEEP (*(SORT(.init_array.*)))
KEEP (*(.init_array*))
PROVIDE_HIDDEN (__init_array_end = .);
. = ALIGN(4);
} >FLASH
.fini_array :
{
. = ALIGN(4);
PROVIDE_HIDDEN (__fini_array_start = .);
KEEP (*(SORT(.fini_array.*)))
KEEP (*(.fini_array*))
PROVIDE_HIDDEN (__fini_array_end = .);
. = ALIGN(4);
} >FLASH
/* Used by the startup to initialize data */
_sidata = LOADADDR(.data);
/* Initialized data sections into RAM memory */
.data :
{
. = ALIGN(4);
_sdata = .; /* create a global symbol at data start */
*(.data) /* .data sections */
*(.data*) /* .data* sections */
. = ALIGN(4);
_edata = .; /* define a global symbol at data end */
} >CCMRAM AT> FLASH
/* Uninitialized data section into RAM memory */
. = ALIGN(4);
.bss :
{
/* This is used by the startup in order to initialize the .bss secion */
_sbss = .; /* define a global symbol at bss start */
__bss_start__ = _sbss;
*(.bss)
*(.bss*)
*(COMMON)
. = ALIGN(4);
_ebss = .; /* define a global symbol at bss end */
__bss_end__ = _ebss;
} >CCMRAM
/* User_heap_stack section, used to check that there is enough RAM left */
._user_heap_stack :
{
. = ALIGN(8);
PROVIDE ( end = . );
PROVIDE ( _end = . );
. = . + _Min_Heap_Size;
. = . + _Min_Stack_Size;
. = ALIGN(8);
} >CCMRAM
/* Remove information from the compiler libraries */
/DISCARD/ :
{
libc.a ( * )
libm.a ( * )
libgcc.a ( * )
}
.ARM.attributes 0 : { *(.ARM.attributes) }
}
And here is part of my startup file
/**
******************************************************************************
* @file startup_stm32.s dedicated to STM32F407ZETx device
* @author Ac6
* @version V1.0.0
* @date 2019-03-30
******************************************************************************
*/
.syntax unified
.cpu cortex-m4
.fpu softvfp
.thumb
.global g_pfnVectors
.global Default_Handler
/* start address for the initialization values of the .data section.
defined in linker script */
.word _sidata
/* start address for the .data section. defined in linker script */
.word _sdata
/* end address for the .data section. defined in linker script */
.word _edata
/* start address for the .bss section. defined in linker script */
.word _sbss
/* end address for the .bss section. defined in linker script */
.word _ebss
/**
* @brief This is the code that gets called when the processor first
* starts execution following a reset event. Only the absolutely
* necessary setup is performed, after which the application
* supplied main() routine is called.
* @param None
* @retval None
*/
/*
 * Reset_Handler: entry point after reset.
 *
 * 1. Loads the stack pointer with _estack.
 * 2. Copies .data from its load address (_sidata) to [_sdata, _edata).
 * 3. Zero-fills .bss, [_sbss, _ebss).
 * 4. Copies .sram1 from _sisram1 to [_s_sram1, _e_sram1) and
 *    .sram2 from _sisram2 to [_s_sram2, _e_sram2).
 * 5. Calls SystemInit, __libc_init_array and main; loops forever if main
 *    returns.
 *
 * .sram1/.sram2 contain initialized code and data, so they are copied only.
 * They must NOT be zero-filled afterwards: clearing the same
 * [_s_sramN, _e_sramN) range would erase what was just copied, and a branch
 * into the section would then execute zeros and end in a hard fault.
 *
 * All copy loops move one 32-bit word per iteration; the linker script aligns
 * the section boundaries to 4 bytes.
 */
.section .text.Reset_Handler
.weak Reset_Handler
.type Reset_Handler, %function
Reset_Handler:
  ldr r0, =_estack
  mov sp, r0 /* set stack pointer */

/* Copy the data segment initializers from flash to RAM */
  ldr r0, =_sdata            /* r0 = destination start */
  ldr r1, =_edata            /* r1 = destination end   */
  ldr r2, =_sidata           /* r2 = source in flash   */
  movs r3, #0                /* r3 = byte offset       */
  b LoopCopyDataInit
CopyDataInit:
  ldr r4, [r2, r3]
  str r4, [r0, r3]
  adds r3, r3, #4
LoopCopyDataInit:
  adds r4, r0, r3
  cmp r4, r1
  bcc CopyDataInit

/* Zero fill the bss segment. */
  ldr r2, =_sbss
  ldr r4, =_ebss
  movs r3, #0
  b LoopFillZerobss
FillZerobss:
  str r3, [r2]
  adds r2, r2, #4
LoopFillZerobss:
  cmp r2, r4
  bcc FillZerobss

/* Copy the .sram1 section (code and data) from flash to SRAM1 */
  ldr r0, =_s_sram1
  ldr r1, =_e_sram1
  ldr r2, =_sisram1
  movs r3, #0
  b LoopCopySRAM1Init
CopySRAM1Init:
  ldr r4, [r2, r3]
  str r4, [r0, r3]
  adds r3, r3, #4
LoopCopySRAM1Init:
  adds r4, r0, r3
  cmp r4, r1
  bcc CopySRAM1Init

/* Copy the .sram2 section (code and data) from flash to SRAM2 */
  ldr r0, =_s_sram2
  ldr r1, =_e_sram2
  ldr r2, =_sisram2
  movs r3, #0
  b LoopCopySRAM2Init
CopySRAM2Init:
  ldr r4, [r2, r3]
  str r4, [r0, r3]
  adds r3, r3, #4
LoopCopySRAM2Init:
  adds r4, r0, r3
  cmp r4, r1
  bcc CopySRAM2Init

/* Call the clock system initialization function. */
  bl SystemInit
/* Call static constructors */
  bl __libc_init_array
/* Call the application's entry point. */
  bl main
LoopForever:
  b LoopForever
Can anybody please tell me whether there is a bug here? Let's say, for now, we are only trying to run code from SRAM1, since that is the one connected to the I-bus and D-bus ports of the Cortex-M4 on this MCU. With respect to the picture below, I would expect bl to jump to the beginning of SRAM1. Can you please shed some light on it:
According to the screen captures, your code is jumping to SRAM (as expected), but there is no code in SRAM (MOV R0, R0 indicates the memory is filled with zeros), hence the hard fault.
The code that you want to execute in RAM has to be copied from Flash to RAM first, and it must still be there when you branch to it.
I'm trying to access the SysTick timer of a Cortex-M3, so I have to switch to privileged mode. I'm doing it like this:
/* Attempt to activate privileged mode by writing 0 to the CONTROL register,
 * followed by an ISB.
 * NOTE(review): a write to CONTROL from unprivileged thread mode is ignored by
 * the core, so this sequence only has an effect when the code is already
 * running privileged. Raising the privilege level requires going through an
 * exception handler (e.g. SVC). */
asm ("mov r0, #0x0");
asm ("msr control, r0");
asm ("ISB");
But it's not working: I'm still unable to write the SYST_CSR register. Is an exception entry required to perform this operation? If yes, how do I do it?
You cannot raise the mode to privileged directly from user mode (you can change to user mode direct from privileged mode). You have to do it via an SVC call (Supervisor call).
How you raise an SVC call will depend on your compiler if you do it in C; in assembler you could use asm("svc #1");
The #1 can be any number. This is made available to the SVC handler. If you want to use the SVC handler for this purpose only, then you don't need to decode the number in the handler and can simply use your assembly above to raise the privilege. However, if you want to use the SVC for more than one purpose then you need to decode the number, so that #1 is for raising the privilege, #2 is for doing something else, etc. The main thing to know here is that the SVC number is encoded in the SVC instruction itself; the handler finds it through the return address (PC) saved in the exception stack frame, which is on the stack you were using when the call was made (either the MSP or PSP). If you were only ever using one stack then it is easier. You will have to look up the stack frame layout in the user guides.
So you need to implement an SVC handler. You should be able to find some examples on the web. There is a good example in the book "The Definitive Guide to ARM Cortex-M3 and Cortex-M4 Processors".
Jonathan Valvano has a SysTick timer code example and other material at http://users.ece.utexas.edu/~valvano/arm/#Timer
; SysTickInts.s
; Runs on LM4F120/TM4C123
; Use the SysTick timer to request interrupts at a particular period.
; Daniel Valvano
; September 11, 2013
; This example accompanies the book
; "Embedded Systems: Introduction to ARM Cortex M Microcontrollers"
; ISBN: 978-1469998749, Jonathan Valvano, copyright (c) 2013
; Volume 1, Program 9.7
; "Embedded Systems: Real Time Interfacing to ARM Cortex M Microcontrollers",
; ISBN: 978-1463590154, Jonathan Valvano, copyright (c) 2013
; Volume 2, Program 5.12, section 5.7
;
;Copyright 2013 by Jonathan W. Valvano, valvano@mail.utexas.edu
; You may use, edit, run or distribute this file
; as long as the above copyright notice remains
;THIS SOFTWARE IS PROVIDED "AS IS". NO WARRANTIES, WHETHER EXPRESS, IMPLIED
;OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF
;MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE APPLY TO THIS SOFTWARE.
;VALVANO SHALL NOT, IN ANY CIRCUMSTANCES, BE LIABLE FOR SPECIAL, INCIDENTAL,
;OR CONSEQUENTIAL DAMAGES, FOR ANY REASON WHATSOEVER.
;For more information about my classes, my research, and my books, see
;http://users.ece.utexas.edu/~valvano/
NVIC_ST_CTRL_R EQU 0xE000E010
NVIC_ST_RELOAD_R EQU 0xE000E014
NVIC_ST_CURRENT_R EQU 0xE000E018
NVIC_ST_CTRL_COUNT EQU 0x00010000 ; Count flag
NVIC_ST_CTRL_CLK_SRC EQU 0x00000004 ; Clock Source
NVIC_ST_CTRL_INTEN EQU 0x00000002 ; Interrupt enable
NVIC_ST_CTRL_ENABLE EQU 0x00000001 ; Counter mode
NVIC_ST_RELOAD_M EQU 0x00FFFFFF ; Counter load value
NVIC_SYS_PRI3_R EQU 0xE000ED20 ; Sys. Handlers 12 to 15 Priority
AREA |.text|, CODE, READONLY, ALIGN=2
THUMB
EXPORT SysTick_Init
; **************SysTick_Init*********************
; Initialize SysTick periodic interrupts, priority 2
; Input: R0 interrupt period
; Units of period are 1/clockfreq
; Maximum is 2^24-1
; Minimum is determined by length of ISR
; Output: none
; Modifies: R0, R1, R2, R3
SysTick_Init
; Configures SysTick for periodic interrupts with the period passed in R0.
; The whole sequence runs with interrupts masked; the caller's PRIMASK is
; saved in R3 on entry and restored before returning.
; start critical section
MRS R3, PRIMASK ; save old status
CPSID I ; mask all (except faults)
; disable SysTick during setup
LDR R1, =NVIC_ST_CTRL_R ; R1 = &NVIC_ST_CTRL_R (pointer)
MOV R2, #0 ; R2 = 0 (also reused below to clear CURRENT)
STR R2, [R1] ; disable SysTick
; reload value = period - 1
LDR R1, =NVIC_ST_RELOAD_R ; R1 = &NVIC_ST_RELOAD_R (pointer)
SUB R0, R0, #1 ; counts down from RELOAD to 0
STR R0, [R1] ; establish interrupt period
; any write to current clears it
LDR R1, =NVIC_ST_CURRENT_R ; R1 = &NVIC_ST_CURRENT_R (pointer)
STR R2, [R1] ; writing to counter clears it
; set NVIC system interrupt 15 to priority 2
LDR R1, =NVIC_SYS_PRI3_R ; R1 = &NVIC_SYS_PRI3_R (pointer)
LDR R2, [R1] ; friendly access (read-modify-write keeps the other priority fields)
AND R2, R2, #0x00FFFFFF ; R2 = R2&0x00FFFFFF (clear interrupt 15 priority)
ORR R2, R2, #0x40000000 ; R2 = R2|0x40000000 (interrupt 15 priority is in bits 31-29; 0x40000000 puts 2 there)
STR R2, [R1] ; set SysTick to priority 2
; enable SysTick with core clock
LDR R1, =NVIC_ST_CTRL_R ; R1 = &NVIC_ST_CTRL_R
; ENABLE SysTick (bit 0), INTEN enable interrupts (bit 1), and CLK_SRC (bit 2) is internal
MOV R2, #(NVIC_ST_CTRL_ENABLE+NVIC_ST_CTRL_INTEN+NVIC_ST_CTRL_CLK_SRC)
STR R2, [R1] ; store a 7 to NVIC_ST_CTRL_R
; end critical section
MSR PRIMASK, R3 ; restore old status
BX LR ; return
ALIGN ; make sure the end of this section is aligned
END ; end of file
I am trying to measure the clock cycles consumed by an assembly program during execution, using the PMU of the ARM1176JZF-S.
I am programming a Raspberry Pi Model B in assembly using the Baking Pi tutorials. The PMU code that I am using is below:
/* Measure elapsed CPU cycles with the ARM1176JZF-S performance monitor.
 * Result: r0 = difference between two consecutive cycle-counter reads.
 * Clobbers: r0, r1.
 */

/* Enable PMU: write 1 to the Performance Monitor Control Register */
mov r0, #1
MCR p15, 0, r0, c15, c12, 0   /* Write Performance Monitor Control Register */

/* Reset cycle counter: write 5 (enable bit plus cycle-counter reset bit) */
mov r0, #5
MCR p15, 0, r0, c15, c12, 0   /* Write Performance Monitor Control Register */

/* Measure: place the code under test between the two reads */
MRC p15, 0, r0, c15, c12, 1   /* Read Cycle Counter Register (start) */
MRC p15, 0, r1, c15, c12, 1   /* Read Cycle Counter Register (end) */
sub r0, r1, r0                /* r0 = end - start = cycle count */
Is there any way I can save this data (the value of r0) in a text or CSV file on my SD card, so I can view my results after the code has executed?
I'm fairly new to the BeagleBone Black, which runs on an AM335x Cortex-A8 processor, and I would like to use the PRU for fast analog reads at the maximum possible sampling rate.
I would like to read all 7 inputs in a loop form like:
while( n*7 < sampling_rate){ //initial value for n = 0
read(AIN0); //and store it in shared memory(7*n + 0)
read(AIN1); //and store it in shared memory(7*n + 1)
read(AIN2); //and store it in shared memory(7*n + 2)
read(AIN3); //and store it in shared memory(7*n + 3)
read(AIN4); //and store it in shared memory(7*n + 4)
read(AIN5); //and store it in shared memory(7*n + 5)
read(AIN6); //and store it in shared memory(7*n + 6)
n++;
}
so that I can read them from a host program running on the main processor. Any idea how to do so? I tried using an existing program called ADCCollector.c from a package named AM335x_pru_package, but I can't figure out how to find all the addresses and values of the registers it uses.
This is the code I was trying to modify (ADCCollector.p):
.origin 0 // offset of the start of the code in PRU memory
.entrypoint START // program entry point, used by debugger only
#include "ADCCollector.hp"
#define BUFF_SIZE 0x00000fa0 //Total buff size: 4kbyte(Each buffer has 2kbyte: 500 piece of data
#define HALF_SIZE BUFF_SIZE / 2
#define SAMPLING_RATE 1 //Sampling rate(16khz) //***//16000
#define DELAY_MICRO_SECONDS (1000000 / SAMPLING_RATE) //Delay by sampling rate
#define CLOCK 200000000 // PRU is always clocked at 200MHz
#define CLOCKS_PER_LOOP 2 // loop contains two instructions, one clock each
#define DELAYCOUNT DELAY_MICRO_SECONDS * CLOCK / CLOCKS_PER_LOOP / 1000 / 1000 * 3 //if sampling rate = 98000 --> = 3061.224
// DELAY: busy-wait loop.
// Loads r10 with DELAYCOUNT and decrements it until it reaches 0.
// Clobbers r10.
.macro DELAY
MOV r10, DELAYCOUNT
DELAY:
SUB r10, r10, 1
QBNE DELAY, r10, 0 // keep looping while r10 != 0
.endm
.macro READADC
//Initialize buffer status (0: empty, 1: first buffer is ready, 2: second buffer is ready)
MOV r2, 0
SBCO r2, CONST_PRUSHAREDRAM, 0, 4
INITV:
MOV r5, 0 //Shared RAM address of ADC Saving position
MOV r6, BUFF_SIZE //Counting variable
READ:
//Read ADC from FIFO0DATA
MOV r2, 0x44E0D100
LBBO r3, r2, 0, 4
//Add address counting
ADD r5, r5, 4
//Write ADC to PRU Shared RAM
SBCO r3, CONST_PRUSHAREDRAM, r5, 4
DELAY
SUB r6, r6, 4
MOV r2, HALF_SIZE
QBEQ CHBUFFSTATUS1, r6, r2 //If first buffer is ready
QBEQ CHBUFFSTATUS2, r6, 0 //If second buffer is ready
QBA READ
//Change buffer status to 1
CHBUFFSTATUS1:
MOV r2, 1
SBCO r2, CONST_PRUSHAREDRAM, 0, 4
QBA READ
//Change buffer status to 2
CHBUFFSTATUS2:
MOV r2, 2
SBCO r2, CONST_PRUSHAREDRAM, 0, 4
QBA INITV
//Send event to host program
MOV r31.b0, PRU0_ARM_INTERRUPT+16
HALT
.endm
// Starting point
START:
// Enable OCP master port
LBCO r0, CONST_PRUCFG, 4, 4 //#define CONST_PRUCFG C4 taken from ADCCollector.hp
CLR r0, r0, 4
SBCO r0, CONST_PRUCFG, 4, 4
//C28 will point to 0x00012000 (PRU shared RAM)
MOV r0, 0x00000120
MOV r1, CTPPR_0
ST32 r0, r1
//Init ADC CTRL register
MOV r2, 0x44E0D040
MOV r3, 0x00000005
SBBO r3, r2, 0, 4
//Enable ADC STEPCONFIG 1
MOV r2, 0x44E0D054
MOV r3, 0x00000002
SBBO r3, r2, 0, 4
//Init ADC STEPCONFIG 1
MOV r2, 0x44E0D064
MOV r3, 0x00000001 //continuous mode
SBBO r3, r2, 0, 4
//Read ADC and FIFOCOUNT
READADC
Another question: if I simply change the #define SAMPLING_RATE from 16000 to any other number less than or equal to 200000 in the (.p) file, will I get that sampling rate, or do I need to change other things as well?
Thanks in advance.
I used the c wrappers from libpruio: http://www.freebasic.net/forum/viewtopic.php?f=14&t=22501
and then use this code to get all my ADC values:
#include "stdio.h"
#include "c_wrapper/pruio.h" // include header
#include "sys/time.h"
//! The main function.
/**
 * Polls the ADC through libpruio in IO mode for roughly 100 ms and prints one
 * CSV row per poll: elapsed milliseconds followed by Value[1]..Value[8].
 * Finally prints the number of rows produced.
 *
 * Always returns 0; initialisation or configuration failures are reported on
 * stdout. The driver structure is destroyed on every path.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    struct timeval start, now;
    long mtime = 0;
    int x = 0; /* rows printed; must start at 0 before being incremented */

    gettimeofday(&start, NULL);
    pruIo *io = pruio_new(PRUIO_DEF_ACTIVE, 0x98, 0, 1); //! create new driver structure
    if (io->Errr) {
        printf("initialisation failed (%s)\n", io->Errr);
    } else if (pruio_config(io, 1, 0x1FE, 0, 4)) { // upload (default) settings, start IO mode
        printf("config failed (%s)\n", io->Errr);
    } else {
        do {
            gettimeofday(&now, NULL);
            long seconds = now.tv_sec - start.tv_sec;
            long useconds = now.tv_usec - start.tv_usec;
            /* elapsed time in milliseconds, rounded to nearest */
            mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5;
            printf("%ld", mtime); /* mtime is a signed long */
            for (int i = 1; i < 9; i++) {
                printf(",%d", io->Adc->Value[i]); //0-66504 for 0-1.8v
            }
            printf("\n");
            x++;
        } while (mtime < 100);
        printf("count: %d \n", x);
    }
    pruio_destroy(io); /* destroy driver structure, also after a failure */
    return 0;
}
In your example you use libpruio in IO mode (synchronous) and therefore you have no control over the sampling rate, since the host CPU doesn't work in real-time.
To get the maximum sampling rate (as mentioned in the OP) you have to use either RB or MM mode. In those modes libpruio buffers the samples in memory and the host can access them asynchronously. See example rb_file.c (or triggers.bas) in the libpruio package.
I'm trying to optimize an image format conversion on iOS using the NEON vector instruction set. I assumed this would map well to that because it processes a bunch of similar data.
My attempts haven't gone that well, though, achieving only a marginal speedup vs the naive c implementation:
// Scalar reference conversion: one 32-bit pixel in, one 16-bit 5-6-5 pixel out.
// Byte 0 of the input word becomes the top 5 bits, byte 1 the middle 6 bits and
// byte 2 the low 5 bits. Both pointers are advanced by pixelCount elements.
for (int px = 0; px < pixelCount; ++px) {
    const unsigned int word = *inPixel32++;
    const unsigned int red5 = ((word >> 0 ) & 0xFF) >> 3;
    const unsigned int grn6 = ((word >> 8 ) & 0xFF) >> 2;
    const unsigned int blu5 = ((word >> 16) & 0xFF) >> 3;
    *outPixel16++ = (red5 << 11) | (grn6 << 5) | (blu5 << 0);
}
1 megapixel image array on iPad 2:
format is [min avg max n=number of timer samples] in milliseconds
C:
[14.446 14.632 18.405 n=1000]ms
NEON:
[11.920 12.032 15.336 n=1000]ms
My attempt at a NEON implementation is below:
int i;
const int pixelsPerLoop = 8;
for(i = 0; i < pixelCount; i += pixelsPerLoop, inPixel32 += pixelsPerLoop, outPixel16 += pixelsPerLoop) {
//Read all r,g,b pixels into 3 registers
uint8x8x4_t rgba = vld4_u8(inPixel32);
//Right-shift r,g,b as appropriate
uint8x8_t r = vshr_n_u8(rgba.val[0], 3);
uint8x8_t g = vshr_n_u8(rgba.val[1], 2);
uint8x8_t b = vshr_n_u8(rgba.val[2], 3);
//Widen b
uint16x8_t r5_g6_b5 = vmovl_u8(b);
//Widen r
uint16x8_t r16 = vmovl_u8(r);
//Left shift into position within 16-bit int
r16 = vshlq_n_u16(r16, 11);
r5_g6_b5 |= r16;
//Widen g
uint16x8_t g16 = vmovl_u8(g);
//Left shift into position within 16-bit int
g16 = vshlq_n_u16(g16, 5);
r5_g6_b5 |= g16;
//Now write back to memory
vst1q_u16(outPixel16, r5_g6_b5);
}
//Do the remainder on normal flt hardware
Code was compiled via LLVM 3.0 into the following (.loc and extra labels removed):
_DNConvert_ARGB8888toRGB565:
push {r4, r5, r7, lr}
mov r9, r1
mov.w r12, #0
add r7, sp, #8
cmp r2, #0
mov.w r1, #0
it ne
movne r1, #1
cmp r0, #0
mov.w r3, #0
it ne
movne r3, #1
cmp.w r9, #0
mov.w r4, #0
it ne
movne r4, #1
tst.w r9, #3
bne LBB0_8
ands r1, r3
ands r1, r4
cmp r1, #1
bne LBB0_8
movs r1, #0
lsr.w lr, r9, #2
cmp.w r1, r9, lsr #2
bne LBB0_9
mov r3, r2
mov r5, r0
b LBB0_5
LBB0_4:
movw r1, #65528
add.w r0, lr, #7
movt r1, #32767
ands r1, r0
LBB0_5:
mov.w r12, #1
cmp r1, lr
bhs LBB0_8
rsb r0, r1, r9, lsr #2
mov.w r9, #63488
mov.w lr, #2016
mov.w r12, #1
LBB0_7:
ldr r2, [r5], #4
subs r0, #1
and.w r1, r9, r2, lsl #8
and.w r4, lr, r2, lsr #5
ubfx r2, r2, #19, #5
orr.w r2, r2, r4
orr.w r1, r1, r2
strh r1, [r3], #2
bne LBB0_7
LBB0_8:
mov r0, r12
pop {r4, r5, r7, pc}
LBB0_9:
sub.w r1, lr, #1
movs r3, #32
add.w r3, r3, r1, lsl #2
bic r3, r3, #31
adds r5, r0, r3
movs r3, #16
add.w r1, r3, r1, lsl #1
bic r1, r1, #15
adds r3, r2, r1
movs r1, #0
LBB0_10:
vld4.8 {d16, d17, d18, d19}, [r0]!
adds r1, #8
cmp r1, lr
vshr.u8 d20, d16, #3
vshr.u8 d21, d17, #2
vshr.u8 d16, d18, #3
vmovl.u8 q11, d20
vmovl.u8 q9, d21
vmovl.u8 q8, d16
vshl.i16 q10, q11, #11
vshl.i16 q9, q9, #5
vorr q8, q8, q10
vorr q8, q8, q9
vst1.16 {d16, d17}, [r2]!
Ltmp28:
blo LBB0_10
b LBB0_4
Full code is available at https://github.com/darknoon/DNImageConvert I would appreciate any help, thanks!
Here you are, hand-optimized NEON implementation ready for XCode :
/* IT DOESN'T WORK!!! USE THE NEXT VERSION BELOW.
* BGRA2RGB565.s
*
* Created by Jake "Alquimista" Lee on 11. 11. 1..
* Copyright 2011 Jake Lee. All rights reserved.
*/
.align 2
.globl _bgra2rgb565_neon
.private_extern _bgra2rgb565_neon
// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);
//ARM
pDst .req r0
pSrc .req r1
count .req r2
//NEON
blu .req d16
grn .req d17
red .req d18
alp .req d19
rg .req red
gb .req blu
_bgra2rgb565_neon:
pld [pSrc]
tst count, #0x7
movne r0, #0
bxne lr
loop:
pld [pSrc, #32]
vld4.8 {blu, grn, red, alp}, [pSrc]!
subs count, count, #8
vshr.u8 red, red, #3
vext.8 rg, grn, red, #5
vshr.u8 grn, grn, #2
vext.8 gb, blu, grn, #3
vst2.8 {gb, rg}, [pDst]!
bgt loop
bx lr
This version will be many times faster than what you suggested :
increased cache hit rate via PLD
conversion to "long" not necessary
fewer instructions within the loop
There is still some room for optimizations though, you could modify the loop so that it converts 16 pixels per iteration instead of 8.
Then you can schedule the instructions to avoid the two stalls completely (which is simply not possible in this 8/iteration version above) and benefit from NEON's dual-issue capability in addition.
I didn't do this because it would make the code hard to understand.
It's important to know what VEXT is supposed to do.
Now it's up to you. :)
I verified this code to be properly compiled under Xcode.
Although I'm pretty sure it works correctly as well, I cannot guarantee this since I don't have the test environment.
In case of malfunctioning, please let me know. I'll correct it accordingly then.
cya
==============================================================================
Well, here is the improved version.
Due to the nature of the VSRI instruction not allowing two operands other than the target, it was not possible to create a more robust one regarding the register assignment.
Please check the image format of your source image. (exact byte order of the elements)
If it's not B, G, R, A, which is the default and native one on iOS, your application will suffer heavily from internal conversions by iOS.
If it's absolutely not possible to change this for whatever the reason, let me know.
I'll write a new version matching it.
PS : I forgot to remove the underscore at the start of the function prototype. Now it's gone.
/*
* BGRA2RGB565.s
*
* Created by Jake "Alquimista" Lee on 11. 11. 1..
* Copyright 2011 Jake Lee. All rights reserved.
*
* Version 1.1
* - bug fix
*
* Version 1.0
* - initial release
*/
.align 2
.globl _bgra2rgb565_neon
.private_extern _bgra2rgb565_neon
// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);
//
// Converts `count` 32-bit B,G,R,A pixels at pSrc to 16-bit RGB565 at pDst,
// 8 pixels per loop iteration.
// Returns (in r0):
//   0      if count is not a multiple of 8 (nothing is converted)
//   pDst   unchanged if count is 0 (nothing is converted)
//   pDst advanced past the last pixel written, otherwise
//ARM
pDst .req r0
pSrc .req r1
count .req r2
//NEON
blu .req d16
grn .req d17
red .req d18
alp .req d19
gb .req grn
rg .req red
_bgra2rgb565_neon:
    pld [pSrc]
    tst count, #0x7
    movne r0, #0
    bxne lr
    // count == 0 passes the multiple-of-8 test above; without this guard the
    // loop body would still run once and convert 8 pixels.
    cmp count, #0
    bxeq lr
.loop:
    pld [pSrc, #32]
    vld4.8 {blu, grn, red, alp}, [pSrc]!
    subs count, count, #8
    vsri.8 red, grn, #5         // high byte: top 5 bits of red, top 3 bits of green
    vshl.u8 gb, grn, #3         // move the next green bits to the top of the low byte
    vsri.8 gb, blu, #3          // low byte: 3 green bits, top 5 bits of blue
    vst2.8 {gb, rg}, [pDst]!    // interleave low byte, high byte for each pixel
    bgt .loop
    bx lr
If you are on iOS or OS X, then you may be delighted to discover vImageConvert_RGBA8888toRGB565() and friends, in Accelerate.framework. This function rounds the 8-bit values to nearest 565 value.
For even better dithering, the quality of which is nearly indistinguishable from 8-bit color, try vImageConvert_AnyToAny():
vImage_CGImageFormat RGBA8888Format =
{
.bitsPerComponent = 8,
.bitsPerPixel = 32,
.bitmapInfo = kCGBitmapByteOrderDefault | kCGImageAlphaNoneSkipLast,
.colorSpace = NULL, // sRGB or substitute your own in
};
vImage_CGImageFormat RGB565Format =
{
.bitsPerComponent = 5,
.bitsPerPixel = 16,
.bitmapInfo = kCGBitmapByteOrder16Little | kCGImageAlphaNone,
.colorSpace = RGBA8888Format.colorSpace,
};
err = vImageConverterRef converter = vImageConverter_CreateWithCGImageFormat(
&RGBA8888Format, &RGB565Format, NULL, kvImageNoFlags, &err );
err = vImageConvert_AnyToAny( converter, &src, &dest, NULL, kvImageNoFlags );
Either of these approaches will be vectorized and multithreaded for best performance.
You might want to use vld4q_u8() instead of vld4_u8() and adjust the rest of your code accordingly. It's hard to tell where the problem might be, but the assembler doesn't look too bad otherwise.
(I'm not familiar with NEON, nor deeply with the memory system of the Ipad2, but this is what we used to do with 88110 pixel-ops, which were an early precursor to today's SIMD extensions)
How big is the memory latency?
Could you hide it by unrolling the inner loop and running the NEON instructions on the "previous" values while the ARM pulls the "next" values from memory? A brief scan of the NEON manual implies you can run ARM and NEON instructions in parallel.
I don't think converting vld4_u8 to vld4q_u8 would lead to any bettering of the performance.
The code seems simple enough. I am not good at ASM and so it would take some time to look into it deeply.
The NEON code seems simple enough, but I am not quite sure about r5_g6_b5 |= g16 being used instead of vorrq_u16.
Please have a look at the optimization level too. As far as I have heard, the optimization level for NEON code goes to a maximum of 1. So the performance comparison may be skewed if the default optimization level is used for both the reference code and the NEON code, as the default level applied to the reference code may be different.
I don't find any area in the NEON code that could improve on the current version.