DPDK implementation of MPSC ring buffer - c11

While going through the implementation of the DPDK MPSC (multi-produce & single-consumer) Ring Buffer API, i found the code to move the head of the producer for inserting new elements in the Ring buffer. The function is as follows :
static __rte_always_inline unsigned int
__rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
unsigned int n, enum rte_ring_queue_behavior behavior,
uint32_t *old_head, uint32_t *new_head,
uint32_t *free_entries)
{
const uint32_t capacity = r->capacity;
uint32_t cons_tail;
unsigned int max = n;
int success;
*old_head = __atomic_load_n(&r->prod.head, __ATOMIC_RELAXED);
do {
/* Reset n to the initial burst count */
n = max;
/* Ensure the head is read before tail */
__atomic_thread_fence(__ATOMIC_ACQUIRE);
/* load-acquire synchronize with store-release of ht->tail
* in update_tail.
*/
cons_tail = __atomic_load_n(&r->cons.tail,
__ATOMIC_ACQUIRE);
/* The subtraction is done between two unsigned 32bits value
* (the result is always modulo 32 bits even if we have
* *old_head > cons_tail). So 'free_entries' is always between 0
* and capacity (which is < size).
*/
*free_entries = (capacity + cons_tail - *old_head);
/* check that we have enough room in ring */
if (unlikely(n > *free_entries))
n = (behavior == RTE_RING_QUEUE_FIXED) ?
0 : *free_entries;
if (n == 0)
return 0;
*new_head = *old_head + n;
if (is_sp)
r->prod.head = *new_head, success = 1;
else
/* on failure, *old_head is updated */
success = __atomic_compare_exchange_n(&r->prod.head,
old_head, *new_head,
0, __ATOMIC_RELAXED,
__ATOMIC_RELAXED);
} while (unlikely(success == 0));
return n;
}
The load and compare exchange of the producer's head is done using __ATOMIC_RELAXED memory ordering. Isn't this a problem when multiple producers from different threads produce to the queue. Or am I missing something?
https://doc.dpdk.org/guides/prog_guide/ring_lib.html describes the basic mechanism that DPDK uses for implementing the Ring buffer.

Related

Using STM32 FMC HAL driver with parallel DAC

Im trying to generate sinusoidal signal with STM32f767 and DAC8412. DAC have 12 bit data bus and 2 bit address to select one of the four analog outputs. I've configuried FMC in CubeIDE for a SRAM memory with 16 bit data and 2 bit addres. I was able to create buffer with 4096 integer values of sin(). Then i've tried to write them to addres 0x60000000, but it only writes 4 values. After that, program goes to HardFault_Handler().
#define SRAM_BANK_ADDR ((uint32_t)0x60000000)
#define RESOLUTION_T ((uint32_t)0x1000)
#define RESOLUTION_Y ((uint32_t)0x1000)
uint32_t aTxBuffer[RESOLUTION_T];
uint32_t address = SRAM_BANK_ADDR;
Thats how i try to send data to DAC:
for (uint32_t i = 0; i<RESOLUTION_T; i++ )
{
*(__IO uint32_t*) address = aTxBuffer[i];
}
Thats how i fill buffer:
static void Fill_Buffer(uint32_t *pBuffer, uint32_t res_T, uint32_t res_Y)
{
uint32_t tmpIndex = 0;
double sinVal;
/* Put in global buffer different values */
for (tmpIndex = 0; tmpIndex < res_T; tmpIndex++ )
{
sinVal = round((sin(M_TWOPI*tmpIndex/res_T)+1)*res_Y/2);
pBuffer[tmpIndex] = sinVal;
}
}

How does copyout() in xv6 avoid race condition in the page table?

// Copy from kernel to user.
// Copy len bytes from src to virtual address dstva in a given page table.
// Return 0 on success, -1 on error.
int
copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len)
{
uint64 n, va0, pa0;
while(len > 0){
va0 = PGROUNDDOWN(dstva);
pa0 = walkaddr(pagetable, va0);
if(pa0 == 0)
return -1;
n = PGSIZE - (dstva - va0);
if(n > len)
n = len;
memmove((void *)(pa0 + (dstva - va0)), src, n);
len -= n;
src += n;
dstva = va0 + PGSIZE;
}
return 0;
}
The function used in xv6 kernel for copying out data from the kernel to user-space process accepts a pagetable as argument and uses that pagetable to access the physical address behind that user-space address. However, neither copyout() nor walkaddr() performs any locking operation on the page table. How are race conditions avoided in this case? Is it possible for the page table to change after getting the physical address and before the actual reading and cause issues? And how is it avoided in xv6?
If I am not wrong, the copyout() is invoked by the kernel in the
piperead() and pipewrite() located in pipewrite.c function. Prior to calling this
function the calling process's spinlock is already held.

Studying the Quicksort algorithm

I study the Quicksort algorithm and created a sample program like this :
// Java program for implementation of QuickSort
class QuickSort
{
/* This function takes last element as pivot,
places the pivot element at its correct
position in sorted array, and places all
smaller (smaller than pivot) to left of
pivot and all greater elements to right
of pivot */
int partition(int arr[], int low, int high)
{
int pivot = arr[high];
int i = (low-1); // index of smaller element
for (int j=low; j<high; j++)
{
// If current element is smaller than or
// equal to pivot
if (arr[j] <= pivot)
{
i++;
// swap arr[i] and arr[j]
int temp = arr[i];
arr[i] = arr[j];
arr[j] = temp;
}
}
// swap arr[i+1] and arr[high] (or pivot)
int temp = arr[i+1];
arr[i+1] = arr[high];
arr[high] = temp;
return i+1;
}
/* The main function that implements QuickSort()
arr[] --> Array to be sorted,
low --> Starting index,
high --> Ending index */
void sort(int arr[], int low, int high)
{
if (low < high)
{
/* pi is partitioning index, arr[pi] is
now at right place */
int pi = partition(arr, low, high);
// Recursively sort elements before
// partition and after partition
sort(arr, low, pi-1);
sort(arr, pi+1, high);
}
}
/* A utility function to print array of size n */
static void printArray(int arr[])
{
int n = arr.length;
for (int i=0; i<n; ++i)
System.out.print(arr[i]+" ");
System.out.println();
}
// Driver program
public static void main(String args[])
{
int arr[] = {10, 7, 8, 9, 1, 5};
int n = arr.length;
QuickSort ob = new QuickSort();
ob.sort(arr, 0, n-1);
System.out.println("sorted array");
printArray(arr);
}
}
/*This code is contributed by Rajat Mishra */
Precisely in this part :
int partition(int arr[], int low, int high)
{
int pivot = arr[high];
int i = (low-1); // index of smaller element
for (int j=low; j<high; j++)
would you have an idea why the code says: int i = (low-1) ? The range of i will not become -1 with this instruction? I mean it was initialized previously to 0? Then do think is it possible such instruction? Or what as you it has to understand?
Best regard,
Thanks,
Intelego.
Yes of course,
First, "i" is initialized to 0
QuickSort ob = new QuickSort();
ob.sort(arr, 0, n-1);
And then,
high --> Ending index */
void sort(int arr[], int low, int high)
{
if (low < high)
{
/* pi is partitioning index, arr[pi] is
now at right place */
int pi = partition(arr, low, high);
Low = 0 until here,
and the code gives "i" the value of -1.
from where "i" wanted to know if it is possible "i" could be -1 ?
In this case what could represent the index -1 ?
Thanks very much,
Intelegoit

How to use Backup SRAM as EEPROM in STM32F4

There are two ways of emulating EEPROM on the STM32F4:
On-chip 4 Kbytes backup SRAM
On-chip Flash, with specific software algorithm
The second option is described here: AN3969.
But google, unfortunately, hasn't been able to provide information on how to use the first option - using the 4Kb of backup SRAM as EEPROM?..
Can anyone help on the topic?
must do these:
Enable the PWR clock
RCC_APB1PeriphClockCmd(RCC_APB1Periph_PWR, ENABLE);
Enable access to the backup domain
PWR_BackupAccessCmd(ENABLE);
Enable backup SRAM Clock
RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);
Enable the Backup SRAM low power Regulator to retain it's content in VBAT mode
PWR_BackupRegulatorCmd(ENABLE);
and you can write/read datas to sram (these codes from BKP_Domain codes in STM32F4xx_DSP_StdPeriph_Lib) (in my mcu stm32f417 BKPSRAM_BASE=0x40024000)
// Write to Backup SRAM with 32-Bit Data
for (i = 0x0; i < 0x100; i += 4) {
*(__IO uint32_t *) (BKPSRAM_BASE + i) = i;
}
// Check the written Data
for (i = 0x0; i < 0x100; i += 4) {
if ((*(__IO uint32_t *) (BKPSRAM_BASE + i)) != i){
errorindex++;
}
}
then if you want:
// Wait until the Backup SRAM low power Regulator is ready
while(PWR_GetFlagStatus(PWR_FLAG_BRR) == RESET)
{}
you can find these functions in STM32F4xx_DSP_StdPeriph_Lib.
after reading through the Reference Manual for stm32f4 and the stm32f405xx/stm32f407xx datasheet I agree that it isn't clear how to actually use the backup sram (or where it is located). Here is what I found. Both the RTC registers and backup SRAM contain some amount of storage that is maintained as long as you have battery power. The RTC contains 20 registers (80 bytes) and the backup sram (which is its own peripheral on AHB1 and located within the register address region) contains 0x1000 (4096 bytes). Neither are enabled by default.
in DM00037051 (stm32f405xx/stm32f407xx datasheet, p29):
The 4-Kbyte backup SRAM is an EEPROM-like memory area. It can be used to store
data which need to be retained in VBAT and standby mode. This memory area is
disabled by default to minimize power consumption (see Section 2.2.19:
Low-power modes). It can be enabled by software.
The backup registers are 32-bit registers used to store 80 bytes of user
application data when VDD power is not present. Backup registers are not reset
by a system, a power reset, or when the device wakes up from the Standby mode
(see Section 2.2.19: Low-power modes).
on page 71 of datasheet and p65 of the reference manual
AHB1 | 0x4002 4000 - 0x4002 4FFF | BKPSRAM
and page 73 of the datatasheet and p67 of the reference manual
APB1 | 0x4000 2800 - 0x4000 2BFF | RTC & BKP Registers
Page 118-119 of the reference manual contains information on enabling the backup SRAM and RTC registers.
NOTE: if you are already using the RTC in the backup domain and only need to store <= 80 bytes, then you are better off using the RTC backup registers because enabling the backup SRAM will basically double the current consumption (see table 25 in the stm32f405/7 datasheet).
here are my write and read functions for both backup SRAM and backup RTC registers
int8_t write_to_backup_sram( uint8_t *data, uint16_t bytes, uint16_t offset ) {
const uint16_t backup_size = 0x1000;
uint8_t* base_addr = (uint8_t *) BKPSRAM_BASE;
uint16_t i;
if( bytes + offset >= backup_size ) {
/* ERROR : the last byte is outside the backup SRAM region */
return -1;
}
RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);
/* disable backup domain write protection */
RCC_APB1PeriphClockCmd(RCC_APB1Periph_PWR, ENABLE); // set RCC->APB1ENR.pwren
PWR_BackupAccessCmd(ENABLE); // set PWR->CR.dbp = 1;
/** enable the backup regulator (used to maintain the backup SRAM content in
* standby and Vbat modes). NOTE : this bit is not reset when the device
* wakes up from standby, system reset or power reset. You can check that
* the backup regulator is ready on PWR->CSR.brr, see rm p144 */
PWR_BackupRegulatorCmd(ENABLE); // set PWR->CSR.bre = 1;
for( i = 0; i < bytes; i++ ) {
*(base_addr + offset + i) = *(data + i);
}
PWR_BackupAccessCmd(DISABLE); // reset PWR->CR.dbp = 0;
return 0;
}
int8_t read_from_backup_sram( uint8_t *data, uint16_t bytes, uint16_t offset ) {
const uint16_t backup_size = 0x1000;
uint8_t* base_addr = (uint8_t *) BKPSRAM_BASE;
uint16_t i;
if( bytes + offset >= backup_size ) {
/* ERROR : the last byte is outside the backup SRAM region */
return -1;
}
RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);
for( i = 0; i < bytes; i++ ) {
*(data + i) = *(base_addr + offset + i);
}
return 0;
}
int8_t write_to_backup_rtc( uint32_t *data, uint16_t bytes, uint16_t offset ) {
const uint16_t backup_size = 80;
volatile uint32_t* base_addr = &(RTC->BKP0R);
uint16_t i;
if( bytes + offset >= backup_size ) {
/* ERROR : the last byte is outside the backup SRAM region */
return -1;
} else if( offset % 4 || bytes % 4 ) {
/* ERROR: data start or num bytes are not word aligned */
return -2;
} else {
bytes >>= 2; /* divide by 4 because writing words */
}
/* disable backup domain write protection */
RCC_APB1PeriphClockCmd(RCC_APB1Periph_PWR, ENABLE); // set RCC->APB1ENR.pwren
PWR_BackupAccessCmd(ENABLE); // set PWR->CR.dbp = 1;
for( i = 0; i < bytes; i++ ) {
*(base_addr + offset + i) = *(data + i);
}
PWR_BackupAccessCmd(DISABLE); // reset PWR->CR.dbp = 0;
// consider also disabling the power peripherial?
return 0;
}
int8_t read_from_backup_rtc( uint32_t *data, uint16_t bytes, uint16_t offset ) {
const uint16_t backup_size = 80;
volatile uint32_t* base_addr = &(RTC->BKP0R);
uint16_t i;
if( bytes + offset >= backup_size ) {
/* ERROR : the last byte is outside the backup SRAM region */
return -1;
} else if( offset % 4 || bytes % 4 ) {
/* ERROR: data start or num bytes are not word aligned */
return -2;
} else {
bytes >>= 2; /* divide by 4 because writing words */
}
/* read should be 32 bit aligned */
for( i = 0; i < bytes; i++ ) {
*(data + i) = *(base_addr + offset + i);
}
return 0;
}
I had to jump from main program to bootloader on user request.
So I put some 'magic number' into BKPSRAM in main program, do CPU soft reset.
Bootloader always starts first.
It checks for 'magic number' if it is present, it executes, else starts main program
when using HAL this is how to jump to bootloader:
__HAL_RCC_PWR_CLK_ENABLE();
HAL_PWR_EnableBkUpAccess();
__BKPSRAM_CLK_ENABLE();
*(__IO uint8_t *)0x40024000 = 42;//magic number
HAL_NVIC_SystemReset();
inside bootloader to read magic number it is enough to enable backup sram clock only (bootloader uses StdPeriphDriver).
RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);
extRequest = *(__IO uint8_t *)0x40024000;
if(extRequest == 42)
//run bootloader
cpu is stm32f407
Here is the example of HAL library to use backup SRAM.
#define WRITE_READ_ADDR 0x01 //offset value.you can change according to your application
uint32_t write_arr = 0xA5A5A5A6;
uint32_t read_arr;
int main()
{
enable_backup_sram();
writeBkpSram(write_arr);
while(1)
{
read_arr = readBkpSram();
}
}
void enable_backup_sram(void)
{
/*DBP : Enable access to Backup domain */
HAL_PWR_EnableBkUpAccess();
/*PWREN : Enable backup domain access */
__HAL_RCC_PWR_CLK_ENABLE();
/*BRE : Enable backup regulator
BRR : Wait for backup regulator to stabilize */
HAL_PWREx_EnableBkUpReg();
/*DBP : Disable access to Backup domain */
HAL_PWR_DisableBkUpAccess();
}
void writeBkpSram(uint32_t l_data)
{
/* Enable clock to BKPSRAM */
__HAL_RCC_BKPSRAM_CLK_ENABLE();
/* Pointer write on specific location of backup SRAM */
(uint32_t *) (BKPSRAM_BASE + WRITE_READ_ADDR) = l_data;
/* Disable clock to BKPSRAM */
__HAL_RCC_BKPSRAM_CLK_DISABLE();
}
uint32_t readBkpSram(void)
{
uint32_t i_retval;
/* Enable clock to BKPSRAM */
__HAL_RCC_BKPSRAM_CLK_ENABLE();
/* Pointer write from specific location of backup SRAM */
i_retval = *(uint32_t*) (BKPSRAM_BASE + WRITE_READ_ADDR);
/* Disable clock to BKPSRAM */
__HAL_RCC_BKPSRAM_CLK_DISABLE();
return i_retval;
}
I'm currently using the an STM32F2xx microcontroller. According to the datasheet:
The 4-Kbyte backup SRAM is an EEPROM-like area.
To retain the content of the RTC backup registers … when VDD is turned off, VBAT pin can be connected to an optional standby voltage supplied by a battery or by another source.
A supercap, for example, would be required to maintain the contents of the backup registers while the microcontroller is powered off.
Also, according to the document:
After reset, the backup domain (… backup SRAM) is protected against possible unwanted write accesses. To enable access to the backup domain, proceed as follows …
It gives you instructions on how to gain access to the backup domain by directly writing to the certain peripheral register. If you have access to the STM32F4xx library, you can call something like this (note: I'm using the STM32F2xx library):
PWR_BackupAccessCmd(ENABLE);
Note: There's is more to it than simply calling the above function, such as enabling the backup SRAM interface clock. Consult the STM32F4 series documentation.
There is a lot of documentation embedded in the library source that is invaluable and if it's available should be read.
On the STM32F2 series microcontroller, SRAM is located at the following memory address range:
0x40024000 - 0x40024FFF
And can be written to somewhere at location, for example, as follows:
#define VAR_LOC ((volatile uint8_t *)(0x40024000))
volatile uint8_t *pVar = VAR_LOC;
*pVar = 5;
Useable example
In header:
//------------------------------------
typedef struct
{
uint32_t isDefault; //must by 0x12345678
uint32_t LastTestNumber;
uint32_t LastUserNumber;
uint32_t LastModeTest;
uint32_t calibv;
uint32_t calibc;
uint32_t WorkTime;
int32_t RTCCalib;
uint32_t LCDContrast;
} sBKPSRAM;
extern sBKPSRAM *BKPSRAM;// = (sSDRAM *)SDRAM_BANK_ADDR;
//------------------------------------
In code head
define as data:
sBKPSRAM *BKPSRAM = (sBKPSRAM *)BKPSRAM_BASE;
In Init:
void main(void)
{
(....)
RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);
PWR_BackupAccessCmd(ENABLE);
PWR_BackupRegulatorCmd(ENABLE);
ifDefault();
(....)
}
In procedure:
//-------------------------------------------------
void ifDefault(void)
{
if (BKPSRAM->LastModeTest!=0x12345678)
{
printf("BKPSRAM to default\r\n");
memset(BKPSRAM,0,sizeof(sBKPSRAM));
BKPSRAM->calibv =66920;
BKPSRAM->calibc =79230;
BKPSRAM->RTCCalib =1;
BKPSRAM->LCDContrast =2;
BKPSRAM->LastModeTest =0x12345678;
}
}
//-------------------------------------------------
HAL Configuration for STM32H7 to access backup SRAM:
#define BKP_RAM (*(__IO uint32_t *) (D3_BKPSRAM_BASE)) //Start address: 0x38800000
Main() {
__HAL_RCC_BKPRAM_CLK_ENABLE();
HAL_PWREx_EnableBkUpReg();
BKP_RAM = 0xA5AA5A55;
}
In addition to that, you need to add a below line in systemInit() to enable write-through access to Backup SRAM.
SCB->CACR |= 1<<2;

What about this OpenCL kernel is causing the error CL_INVALID_COMMAND_QUEUE

I'm having a problem implementing a Feed-Forward MultiLayer Perceptron, with back-prop learning in OpenCL in Java, using JOCL. Here is the kernel code for the calculation phase:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void Neuron(__global const double *inputPatterns,
__global double *weights,
__global const int *numInputs,
__global const int *activation,
__global const double *bias,
__global const int *usingBias,
__global double *values,
__global const int *maxNumFloats,
__global const int *patternIndex,
__global const int *inputPatternSize,
__global const int *indexOffset,
__global const int *isInputNeuron,
__global const int *inputs)
{
int gid = get_global_id(0);
double sum = 0.0;
for(int i = 0; i < numInputs[gid+indexOffset[0]]; i++)
{
sum += values[inputs[(gid+indexOffset[0]) * maxNumFloats[0] + i]] *
weights[(gid+indexOffset[0]) * maxNumFloats[0] + i];
}
if(usingBias[gid+indexOffset[0]])
sum += bias[gid+indexOffset[0]];
if(isInputNeuron[gid+indexOffset[0]])
sum += inputPatterns[gid+indexOffset[0]+(patternIndex[0] * inputPatternSize[0])];
if(activation[gid+indexOffset[0]] == 1)
sum = 1.0 / (1.0 + exp(-sum));
values[gid + indexOffset[0]] = sum;
}
Basically, I run this kernel for each layer in the network. For the first layer, there are no "inputs", so the loop does not execute. As the first layer is an input node layer however, it does add the relevant value from the input pattern. This executes fine, and I can read back the values at this point.
When I try and run the SECOND layer however (which does have inputs, every node from the first layer), a call to clFinish() returns the error CL_INVALID_COMMAND_QUEUE. Sometimes this error is coupled with a driver crash and recovery. I have read around (here for example) that this might be a problem with TDR timeouts, and have made an attempt to raise the limit but unsure if this is making any difference.
I'm going through the calls to clSetKernelArg() to check for anything stupid, but can anyone spot anything obviously off in the code? It would seem that the error is introduced in the second layer due to the inclusion of the for loop... I can clarify any of the parameters if its needed but it seemed a bit overkill for an initial post.
Also, I'm fully aware this code will probably be an affront to competent coders everywhere, but feel free to flame :P
EDIT: Host code:
//Calc
for(int k = 0; k < GPUTickList.length; k++)
{
clFlush(clCommandQueue);
clFinish(clCommandQueue);
//If input nodes
if(k == 0)
//Set index offset to 0
GPUMapIndexOffset.asIntBuffer().put(0, 0);
else
//Update index offset
GPUMapIndexOffset.asIntBuffer().put(0,
GPUMapIndexOffset.asIntBuffer().get(0) + GPUTickList[k-1]);
//Write index offset to GPU buffer
ret = clEnqueueWriteBuffer(clCommandQueue, memObjects[12], CL_TRUE, 0,
Sizeof.cl_int, Pointer.to(GPUMapIndexOffset.position(0)), 0, null, null);
//Set work size (width of layer)
global_work_size[0] = GPUTickList[k];
ret = clEnqueueNDRangeKernel(clCommandQueue, kernel_iterate, 1,
global_work_offset, global_work_size, local_work_size,
0, null, null);
}
EDIT 2: I've uploaded the full code to pastebin.
Solved. Fixed the error by making everything indexed with [0] a straight kernel parameter, rather than a buffer. Clearly the hardware doesn't like lots of stuff accessing one particular element of a buffer at once.
I'm not sure about what you have above the loop.. do you use the queue other than in this loop? Below is something you may want to try out.
//flush + finish if you need to before the loop, otherwise remove these lines
clFlush(clCommandQueue);
clFinish(clCommandQueue);
cl_event latestEvent;
//Calc
for(int k = 0; k < GPUTickList.length; k++)
{
//If input nodes
if(k == 0)
//Set index offset to 0
GPUMapIndexOffset.asIntBuffer().put(0, 0);
else
//Update index offset
GPUMapIndexOffset.asIntBuffer().put(0,
GPUMapIndexOffset.asIntBuffer().get(0) + GPUTickList[k-1]);
//Write index offset to GPU buffer
ret = clEnqueueWriteBuffer(clCommandQueue, memObjects[12], CL_TRUE, 0,
Sizeof.cl_int, Pointer.to(GPUMapIndexOffset.position(0)), 0, null, null);
//Set work size (width of layer)
global_work_size[0] = GPUTickList[k];
ret = clEnqueueNDRangeKernel(clCommandQueue, kernel_iterate, 1,
global_work_offset, global_work_size, local_work_size,
0, null, &latestEvent);
clWaitForEvents(1, &latestEvent);
}