PMC to count if software prefetch hit L1 cache

PMC to count if software prefetch hit L1 cache - x86-64

I am trying to find a PMC (Performance Monitoring Counter) that will display the amount of times that a prefetcht0 instruction hits L1 dcache (or misses).
icelake-client: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
I am trying to make this fine grain i.e (note should include lfence around prefetcht0)
xorl %ecx, %ecx
rdpmc
movl %eax, %edi
prefetcht0 (%rsi)
rdpmc
testl %eax, %edi
// jump depending on if it was a miss or not
The goal is to check whether a prefetch hit L1. If it didn't, execute some other code that is ready; otherwise proceed.
It seems that it will have to be a miss event just based on what is available.
I have tried a few events from libpfm4 and intel manual with no luck:
L1-DCACHE-LOAD-MISSES, emask=0x00, umask=0x10000
L1D.REPLACEMENT, emask=0x51, umask=0x1
L2_RQSTS.SWPF_HIT, emask=0x24, umask=0xc8
L2_RQSTS.SWPF_MISS, emask=0x24, umask=0x28
LOAD_HIT_PREFETCH.SWPF, emask=0x4c, umask=0x01 (this very misleadingly is non-sw prefetch hits)
L1D.REPLACEMENT and L1-DCACHE-LOAD-MISSES kind of works, it works if I delay the rdpmc but if they are one after another it seems unreliable at best. The other ones are complete busts.
Questions:
Should any of these work for detecting if prefetches hit L1 dcache? (i.e my testing is bad)
If not, what events could be used to detect if a prefetch hit L1 dcache?
Edit: MEM_LOAD_RETIRED.L1_HIT does not appear to work for software prefetch.
Here is the code I am using to do test:
#include <asm/unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
// Scenario selectors compared against TODO: with HIT, get_addr() also writes
// the first byte of every page; with MISS it only writes the last byte.
#define HIT 0
#define MISS 1
// Scenario compiled into this build.
#define TODO MISS
// Stride in bytes between the locations get_addr() touches.
#define PAGE_SIZE 4096
// to force hit make TSIZE low
// Number of PAGE_SIZE-sized pages that get_addr() maps and touches.
#define TSIZE 10000
/*
 * Abort the process when `cond` is false.
 *
 * On failure prints "<line>:<errno>: <strerror(errno)>" to stderr and calls
 * exit(-1). The body is wrapped in do { } while (0) so the macro expands to a
 * single statement: it needs a trailing semicolon and is safe as the body of
 * an unbraced if/else.
 */
#define err_assert(cond)                                                      \
    do {                                                                      \
        if (__builtin_expect(!(cond), 0)) {                                   \
            fprintf(stderr, "%d:%d: %s\n", __LINE__, errno, strerror(errno)); \
            exit(-1);                                                         \
        }                                                                     \
    } while (0)
/*
 * Map TSIZE anonymous, private, read/write pages and touch them.
 *
 * The last byte of every page is written; when TODO == HIT the first byte of
 * every page is written as well. Returns the base address of the mapping as
 * an integer. Aborts through err_assert if the mapping cannot be created.
 */
uint64_t
get_addr() {
    void * mapping = mmap(NULL, TSIZE * PAGE_SIZE, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    // mmap signals failure with MAP_FAILED, not NULL; a NULL comparison
    // would let a failed mapping through and crash on the first store below.
    err_assert(mapping != MAP_FAILED);

    uint8_t * addr = (uint8_t *)mapping;
    for (uint32_t i = 0; i < TSIZE; ++i) {
        addr[i * PAGE_SIZE + (PAGE_SIZE - 1)] = 0;
#if TODO == HIT
        addr[i * PAGE_SIZE] = 0;
#endif
    }
    return (uint64_t)(uintptr_t)addr;
}
/*
 * Invoke the perf_event_open system call through syscall().
 *
 * All arguments are forwarded unchanged; the syscall result is returned
 * narrowed to int.
 */
int
perf_event_open(struct perf_event_attr * hw_event,
                pid_t pid,
                int cpu,
                int group_fd,
                unsigned long flags) {
    const long rc =
        syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
    return (int)rc;
}
/*
 * Reset *pe and fill in the fields this program needs.
 *
 * pe        - attribute structure to initialise (fully zeroed first)
 * type      - stored in pe->type
 * ev_config - stored in pe->config
 * lead      - non-zero makes the event start disabled, zero leaves it enabled
 *
 * Kernel and hypervisor activity are always excluded.
 */
void
init_perf_event_struct(struct perf_event_attr * pe,
                       const uint32_t type,
                       const uint64_t ev_config,
                       int lead) {
    const size_t attr_size = sizeof(*pe);

    __builtin_memset(pe, 0, attr_size);
    pe->size = attr_size;
    pe->type = type;
    pe->config = ev_config;
    pe->exclude_hv = 1;
    pe->exclude_kernel = 1;
    pe->disabled = (lead != 0);
}
/* Fixed Counters */
// Raw event codes are laid out as (umask << 8) | event select.
// Instructions retired is event 0xc0 and unhalted core cycles is event 0x3c;
// the two codes were previously attached to the opposite names. The *_idx
// values are the rdpmc selectors (bit 30 set) for fixed counters 0, 1 and 2.
static constexpr uint32_t core_instruction_ev = 0x00c0;
static constexpr uint32_t core_instruction_idx = (1 << 30) + 0;
static constexpr uint32_t core_cycles_ev = 0x003c;
static constexpr uint32_t core_cycles_idx = (1 << 30) + 1;
static constexpr uint32_t ref_cycles_ev = 0x0300;
static constexpr uint32_t ref_cycles_idx = (1 << 30) + 2;
/* programmable counters */
static constexpr uint32_t mem_load_retired_l1_hit = 0x01d1;
static constexpr uint32_t mem_load_retired_l1_miss = 0x08d1;
/*
 * Open one group of four raw events on the calling thread (pid 0, any CPU).
 *
 * The first event is created disabled and becomes the group leader; the
 * other three are attached to it. Any failure aborts through err_assert.
 * Returns the leader's file descriptor; the member descriptors are not
 * returned and stay open.
 */
int
init_perf_tracking() {
    struct perf_event_attr attr;
    int member_fd;

    init_perf_event_struct(&attr, PERF_TYPE_RAW, core_instruction_ev, 1);
    const int group_fd = perf_event_open(&attr, 0, -1, -1, 0);
    err_assert(group_fd >= 0);

    init_perf_event_struct(&attr, PERF_TYPE_RAW, core_cycles_ev, 0);
    member_fd = perf_event_open(&attr, 0, -1, group_fd, 0);
    err_assert(member_fd >= 0);

    init_perf_event_struct(&attr, PERF_TYPE_RAW, ref_cycles_ev, 0);
    member_fd = perf_event_open(&attr, 0, -1, group_fd, 0);
    err_assert(member_fd >= 0);

    init_perf_event_struct(&attr, PERF_TYPE_RAW, mem_load_retired_l1_hit, 0);
    member_fd = perf_event_open(&attr, 0, -1, group_fd, 0);
    err_assert(member_fd >= 0);

    return group_fd;
}
void
start_perf_tracking(int leadfd) {
ioctl(leadfd, PERF_EVENT_IOC_RESET, 0);
ioctl(leadfd, PERF_EVENT_IOC_ENABLE, 0);
}
// Two-level stringification: the outer macro expands its argument first, the
// inner one turns the result into a string literal.
#define _V_TO_STR(X) #X
#define V_TO_STR(X) _V_TO_STR(X)
// DO_MEMORY_OP(name) yields one line of inline-asm text that touches the
// address held in the asm operand called `name`: a prefetcht0 when
// DO_PREFETCH is defined, otherwise a 32-bit load into %eax.
//#define DO_PREFETCH
#ifndef DO_PREFETCH
#define DO_MEMORY_OP(addr) "movl (%[" V_TO_STR(addr) "]), %%eax\n\t"
#else
#define DO_MEMORY_OP(addr) "prefetcht0 (%[" V_TO_STR(addr) "])\n\t"
#endif
// Measure a single DO_MEMORY_OP on a freshly mapped address.
// Sets up and starts the perf events, then uses rdpmc to sample two counters
// before and after the memory operation and prints both deltas to stderr.
int
main() {
int fd = init_perf_tracking();
start_perf_tracking(fd);
uint64_t addr = get_addr();
// Despite its name, prefetch_miss receives the delta of the counter selected
// with ecx = 0 and is printed under the label "Hit".
uint32_t prefetch_miss, cycles_to_detect;
asm volatile(
"lfence\n\t"
// "Before" sample of the counter selected by core_cycles_idx.
"movl %[core_cycles_idx], %%ecx\n\t"
"rdpmc\n\t"
"movl %%eax, %[cycles_to_detect]\n\t"
// "Before" sample of the counter selected by ecx = 0.
"xorl %%ecx, %%ecx\n\t"
"rdpmc\n\t"
"movl %%eax, %[prefetch_miss]\n\t"
// The operation under test, fenced on both sides.
"lfence\n\t"
DO_MEMORY_OP(prefetch_addr)
"lfence\n\t"
// "After" samples; each subtraction leaves end - start in the output operand.
// Only the low 32 bits (eax) of each rdpmc result are used.
"xorl %%ecx, %%ecx\n\t"
"rdpmc\n\t"
"subl %[prefetch_miss], %%eax\n\t"
"movl %%eax, %[prefetch_miss]\n\t"
"movl %[core_cycles_idx], %%ecx\n\t"
"rdpmc\n\t"
"subl %[cycles_to_detect], %%eax\n\t"
"movl %%eax, %[cycles_to_detect]\n\t"
"lfence\n\t"
// Outputs are early-clobber ("=&r") because they are written before the
// input operands are last read.
: [ prefetch_miss ] "=&r"(prefetch_miss),
[ cycles_to_detect ] "=&r"(cycles_to_detect)
: [ prefetch_addr ] "r"(addr), [ core_cycles_idx ] "i"(core_cycles_idx)
: "eax", "edx", "ecx");
fprintf(stderr, "Hit : %d\n", prefetch_miss);
fprintf(stderr, "Cycles : %d\n", cycles_to_detect);
}
if I define DO_PREFETCH the results for MEM_LOAD_RETIRED.L1_HIT are always 1 (always appears to get a hit). If I comment out DO_PREFETCH the results correspond with what I would expect (when the address is clearly not in cache reports miss, when it clearly is reports hit).
With DO_PREFETCH:
g++ -DDO_PREFETCH -O3 -march=native -mtune=native prefetch_hits.cc -o prefetch_hits
$> ./prefetch_hits
Hit : 1
Cycles : 554
and without DO_PREFETCH
g++ -O3 -march=native -mtune=native prefetch_hits.cc -o prefetch_hits
$> ./prefetch_hits
Hit : 0
Cycles : 888
With L2_RQSTS.SWPF_HIT and L2_RQSTS.SWPF_MISS was able to get it to work. Big thanks to Hadi Brais. Worth noting that the reason L1D_PEND_MISS.PENDING didn't work might be related to Icelake. Hadi Brais reported getting it to work for predicting L1D cached misses on Haswell.
In the interest of trying to determine why L1D_PEND_MISS.PENDING and MEM_LOAD_RETIRED.L1_HIT do not work, I have posted the exact code I'm using for testing them:
#include <asm/unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
// Scenario selectors compared against TODO: with HIT, main() writes to the
// measured address before the measurement; with MISS it does not.
#define HIT 0
#define MISS 1
// Scenario compiled into this build.
#define TODO MISS
// Page size in bytes used for the mapping size and the TLB-warming offset.
#define PAGE_SIZE 4096
// Number of PAGE_SIZE-sized pages that get_addr() maps.
#define TSIZE 1000
/*
 * Abort the process when `cond` is false: print
 * "<line>:<errno>: <strerror(errno)>" to stderr and exit(-1).
 *
 * Expands to one do { } while (0) statement, so a call followed by a
 * semicolon behaves like a function call even inside an unbraced if/else.
 */
#define err_assert(cond)                                                      \
    do {                                                                      \
        if (__builtin_expect(!(cond), 0)) {                                   \
            fprintf(stderr, "%d:%d: %s\n", __LINE__, errno, strerror(errno)); \
            exit(-1);                                                         \
        }                                                                     \
    } while (0)
/*
 * Map TSIZE anonymous, private, read/write pages and fill every byte
 * with 0xff.
 *
 * Returns the base address of the mapping as an integer. Aborts through
 * err_assert if the mapping cannot be created.
 */
uint64_t
get_addr() {
    void * mapping = mmap(NULL, TSIZE * PAGE_SIZE, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    // mmap signals failure with MAP_FAILED, not NULL; a NULL comparison
    // would let a failed mapping reach the memset below.
    err_assert(mapping != MAP_FAILED);

    uint8_t * addr = (uint8_t *)mapping;
    __builtin_memset(addr, -1, TSIZE * PAGE_SIZE);
    return (uint64_t)(uintptr_t)addr;
}
/*
 * Forward the arguments to the perf_event_open system call via syscall()
 * and return its result as an int.
 */
int
perf_event_open(struct perf_event_attr * hw_event,
                pid_t pid,
                int cpu,
                int group_fd,
                unsigned long flags) {
    return (int)syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd,
                        flags);
}
/*
 * Zero *pe, then set its size, type and config, exclude kernel and
 * hypervisor activity, and mark it disabled when `lead` is non-zero.
 */
void
init_perf_event_struct(struct perf_event_attr * pe,
                       const uint32_t type,
                       const uint64_t ev_config,
                       int lead) {
    __builtin_memset(pe, 0, sizeof(*pe));

    pe->size = sizeof(*pe);
    pe->config = ev_config;
    pe->type = type;

    pe->exclude_kernel = 1;
    pe->exclude_hv = 1;
    pe->disabled = lead ? 1 : 0;
}
/* Fixed Counters */
// Raw event codes are laid out as (umask << 8) | event select.
// Instructions retired is event 0xc0 and unhalted core cycles is event 0x3c;
// the two codes were previously attached to the opposite names. The *_idx
// values are the rdpmc selectors (bit 30 set) for fixed counters 0, 1 and 2.
static constexpr uint32_t core_instruction_ev = 0x00c0;
static constexpr uint32_t core_instruction_idx = (1 << 30) + 0;
static constexpr uint32_t core_cycles_ev = 0x003c;
static constexpr uint32_t core_cycles_idx = (1 << 30) + 1;
static constexpr uint32_t ref_cycles_ev = 0x0300;
static constexpr uint32_t ref_cycles_idx = (1 << 30) + 2;
/* programmable counters */
static constexpr uint32_t mem_load_retired_l1_hit = 0x01d1;
static constexpr uint32_t mem_load_retired_l1_miss = 0x08d1;
static constexpr uint32_t l1d_pending = 0x0148;
static constexpr uint32_t swpf_hit = 0xc824;
static constexpr uint32_t swpf_miss = 0x2824;
// ev0 is always opened as the first programmable event; ev1 is opened as
// well when NEVENTS is greater than 1.
static constexpr uint32_t ev0 = l1d_pending;
#define NEVENTS 1
#if NEVENTS > 1
static constexpr uint32_t ev1 = swpf_miss;
#endif
/*
 * Open one event group on the calling thread (pid 0, any CPU).
 *
 * The first event is created disabled and becomes the group leader; two more
 * fixed-counter events and ev0 (plus ev1 when NEVENTS > 1) are attached to
 * it. Any failure aborts through err_assert. Returns the leader's file
 * descriptor; the member descriptors are not returned and stay open.
 */
int
init_perf_tracking() {
    struct perf_event_attr attr;
    int member_fd;

    init_perf_event_struct(&attr, PERF_TYPE_RAW, core_instruction_ev, 1);
    const int group_fd = perf_event_open(&attr, 0, -1, -1, 0);
    err_assert(group_fd >= 0);

    init_perf_event_struct(&attr, PERF_TYPE_RAW, core_cycles_ev, 0);
    member_fd = perf_event_open(&attr, 0, -1, group_fd, 0);
    err_assert(member_fd >= 0);

    init_perf_event_struct(&attr, PERF_TYPE_RAW, ref_cycles_ev, 0);
    member_fd = perf_event_open(&attr, 0, -1, group_fd, 0);
    err_assert(member_fd >= 0);

    init_perf_event_struct(&attr, PERF_TYPE_RAW, ev0, 0);
    member_fd = perf_event_open(&attr, 0, -1, group_fd, 0);
    err_assert(member_fd >= 0);

#if NEVENTS > 1
    init_perf_event_struct(&attr, PERF_TYPE_RAW, ev1, 0);
    member_fd = perf_event_open(&attr, 0, -1, group_fd, 0);
    err_assert(member_fd >= 0);
#endif

    return group_fd;
}
void
start_perf_tracking(int leadfd) {
ioctl(leadfd, PERF_EVENT_IOC_RESET, 0);
ioctl(leadfd, PERF_EVENT_IOC_ENABLE, 0);
}
// Two-level stringification: the outer macro expands its argument first, the
// inner one turns the result into a string literal.
#define _V_TO_STR(X) #X
#define V_TO_STR(X) _V_TO_STR(X)
// SERIALIZER() yields the inline-asm text placed around the measured
// operation: a single lfence when LFENCE is defined, otherwise cpuid with
// eax and ecx zeroed first.
//#define LFENCE
#ifndef LFENCE
#define SERIALIZER()        \
"xorl %%ecx, %%ecx\n\t" \
"xorl %%eax, %%eax\n\t" \
"cpuid\n\t"
#else
#define SERIALIZER() "lfence\n\t"
#endif
// DO_MEMORY_OP(name) yields one line of inline-asm text that touches the
// address held in the asm operand called `name`: a prefetcht0 when
// DO_PREFETCH is defined, otherwise a 32-bit load into %eax.
#define DO_PREFETCH
#ifndef DO_PREFETCH
#define DO_MEMORY_OP(addr) "movl (%[" V_TO_STR(addr) "]), %%eax\n\t"
#else
#define DO_MEMORY_OP(addr) "prefetcht0 (%[" V_TO_STR(addr) "])\n\t"
#endif
/*
 * Measure a single DO_MEMORY_OP on a mapped address.
 *
 * Starts the perf events, warms the TLB entry of the target page (and, for
 * TODO == HIT, writes to the target address itself), then samples the counter
 * selected by core_cycles_idx and programmable counter 0 (and 1 when
 * NEVENTS > 1) with rdpmc before and after the operation. The deltas are
 * printed to stderr as E0, E1 and Cycles.
 */
int
main() {
    int fd = init_perf_tracking();
    start_perf_tracking(fd);
    uint64_t addr = get_addr();
    // to ensure page in TLB
    *((volatile uint64_t *)(addr + (PAGE_SIZE - 8))) = 0;
#if TODO == HIT
    // loading from 0 offset to check cache miss / hit
    *((volatile uint64_t *)addr) = 0;
#endif
    uint32_t ecount0 = 0, ecount1 = 0, cycles_to_detect = 0;
    asm volatile(
        SERIALIZER()
        // "Before" samples. Only the low 32 bits (eax) of rdpmc are kept.
        "movl %[core_cycles_idx], %%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax, %[cycles_to_detect]\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax, %[ecount0]\n\t"
#if NEVENTS > 1
        "movl $1, %%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax, %[ecount1]\n\t"
#endif
        // Operation under test, serialized on both sides.
        SERIALIZER()
        DO_MEMORY_OP(prefetch_addr)
        SERIALIZER()
        // "After" samples; each subtraction leaves end - start in the output.
        "xorl %%ecx, %%ecx\n\t"
        "rdpmc\n\t"
        "subl %[ecount0], %%eax\n\t"
        "movl %%eax, %[ecount0]\n\t"
#if NEVENTS > 1
        "movl $1, %%ecx\n\t"
        "rdpmc\n\t"
        "subl %[ecount1], %%eax\n\t"
        "movl %%eax, %[ecount1]\n\t"
#endif
        "movl %[core_cycles_idx], %%ecx\n\t"
        "rdpmc\n\t"
        "subl %[cycles_to_detect], %%eax\n\t"
        "movl %%eax, %[cycles_to_detect]\n\t"
        SERIALIZER()
        : [ ecount0 ] "=&r"(ecount0),
#if NEVENTS > 1
          [ ecount1 ] "=&r"(ecount1),
#endif
          [ cycles_to_detect ] "=&r"(cycles_to_detect)
        : [ prefetch_addr ] "r"(addr), [ core_cycles_idx ] "i"(core_cycles_idx)
        // cpuid (the default SERIALIZER) overwrites eax, ebx, ecx and edx.
        // Without "ebx" in this list the compiler may keep an operand, even
        // the prefetch address, in rbx and have it destroyed. "memory" keeps
        // the warm-up stores above from being moved across the measurement.
        : "eax", "ebx", "ecx", "edx", "memory");
    // The counters are unsigned 32-bit deltas, so print them as unsigned.
    fprintf(stderr, "E0 : %u\n", ecount0);
    fprintf(stderr, "E1 : %u\n", ecount1);
    fprintf(stderr, "Cycles : %u\n", cycles_to_detect);
}

The rdpmc is not ordered with the events that may occur before it or after it in program order. A fully serializing instruction, such as cpuid, is required to obtain the desired ordering guarantees with respect to prefetcht0. The code should be as follows:
xor %eax, %eax # CPUID leaf eax=0 should be fast. Doing this before each CPUID might be a good idea, but omitted for clarity
cpuid
xorl %ecx, %ecx
rdpmc
movl %eax, %edi # save RDPMC result before CPUID overwrites EAX..EDX
cpuid
prefetcht0 (%rsi)
cpuid
xorl %ecx, %ecx
rdpmc
testl %eax, %edi # CPUID doesn't affect FLAGS
cpuid
Each of the rdpmc instructions are sandwiched between cpuid instructions. This ensures that any events and only these events that occur between the two rdpmc instructions are counted.
The prefetch operation of the prefetcht0 instruction may either be ignored or performed. If it was performed, it may either hit in a cache line that is in a valid state in the L1D or not. These are the cases that have to be considered.
The sum of L2_RQSTS.SWPF_HIT and L2_RQSTS.SWPF_MISS cannot be used to count or derive the number of prefetcht0 hits in the L1D, but their sum can be subtracted from SW_PREFETCH_ACCESS.T0 to get an upper bound on the number of prefetcht0 hits in the L1D. With the properly serialized sequence shown above, I think the only case where a non-ignored prefetcht0 doesn't hit in the L1D and is not counted by the sum SWPF_HIT+SWPF_MISS is if the software prefetch operation hits in an LFB allocated for a hardware prefetch.
L1-DCACHE-LOAD-MISSES is just another name for L1D.REPLACEMENT. The event code and umask you've shown for L1-DCACHE-LOAD-MISSES is incorrect. The L1D.REPLACEMENT event only occurs if the prefetch operation misses in the L1D (which causes a request to be sent to the L2) and causes a valid line in the L1D to be replaced. Usually most fills cause a replacement, but the event still cannot be used to distinguish between a prefetcht0 that hits in the L1D, a prefetcht0 that hits in an LFB allocated for a hardware prefetch, and an ignored prefetcht0.
The event LOAD_HIT_PREFETCH.SWPF occurs when a demand load hits in an LFB allocated for a software prefetch. This is obviously not useful here.
The event L1D_PEND_MISS.PENDING (event=0x48, umask=0x01) should work. According to the documentation, this event increments the counter by the number of pending L1D misses every cycle. I think it works for demand loads and prefetches. This is really an approximation, so it may count even if there are zero pending L1D misses. But I think it can still be used to determine with very high confidence whether a single prefetcht0 missed in the L1D by following these steps:
First, add the line uint64_t value = *(volatile uint64_t*)addr; just before the inline assembly. This is to increase the probability to near 100% that the line to be prefetched is in the L1D.
Second, measure the minimum increment of L1D_PEND_MISS.PENDING for a prefetcht0 that is very highly likely to hit in the L1D.
Run the experiment many times to build high confidence that the minimum increment is highly stable, to the extent that the same exact value is observed in almost every run.
Comment out the line added in the first step so that the prefetcht0 misses and check that the event count change is always or almost always larger than the minimum increment measured previously.
So far, I've only been concerned with making a distinction between a prefetch that hits in the L1D and a non-ignored prefetch that misses in both the L1D and the LFBs. Now I'll consider the rest of the cases:
If the prefetch results in a page fault or if the memory type of the target cache line is WC or UC, the prefetch is ignored. I don't know whether the L1D_PEND_MISS.PENDING event can be used to distinguish between a hit and this case. You can run an experiment where the target address of the prefetch instruction is in a virtual page with no valid mapping or mapped to a kernel page. Check if the change in the event count is unique with high probability.
If no LFBs are available, the prefetch is ignored. This case can be eliminated by switching off the sibling logical core and using cpuid instead of lfence before the first rdpmc.
If the prefetch hits in an LFB allocated for an RFO, ItoM, or a hardware prefetch request, then the prefetch is effectively redundant. For all of these types of requests, the change in the L1D_PEND_MISS.PENDING count may or may not be distinguishable from a hit in the L1D. This case can be eliminated by using cpuid instead of lfence before the first rdpmc and turning off the two L1D hardware prefetchers.
I don't think a prefetch to a prefetchable memory type can hit in a WCB because changing the memory type of a location is a fully serializing operation, so this case is not a problem.
One obvious advantage of using L1D_PEND_MISS.PENDING instead of the sum SWPF_HIT+SWPF_MISS is the smaller number of events. Another advantage is that L1D_PEND_MISS.PENDING is supported on some of the earlier microarchitectures. Also, as discussed above, it can be more powerful. It works on my Haswell with a threshold of 69-70 cycles.
If the L1D_PEND_MISS.PENDING event changes in different cases are not distinguishable, then the sum SWPF_HIT+SWPF_MISS can be used. These two events occur at the L2 and so they only tell you whether the prefetch missed in the L1D and a request is sent and accepted by the L2. If the request is rejected or hit in the L2's SQ, none of the two events may occur. In addition, all of the aforementioned cases will not be distinguishable from an L1D hit.
For normal demand loads, you can use MEM_LOAD_RETIRED.L1_HIT. If the load hits in the L1D, a single L1_HIT occurs. Otherwise, in any other case, no L1_HIT events occur, assuming that no other instruction between the two rdpmcs, such as cpuid, can generate L1_HIT events. You'll have to verify that cpuid doesn't generate L1_HIT events. Don't forget to count only user-mode events because an interrupt can occur between any two instructions and the interrupt handler may generate one or more L1_HIT events in kernel mode. While it's very unlikely, if you want to be 100% sure, check also whether the occurrence of an interrupt itself generates L1_HIT events.

Related

RPi Wiringpi fails to read i2c correctly

I have an AHT21 that communicates over i2c: I send 3 bytes and get back 6. The arduino sketch works but the RPi does not. What is wrong with WiringPi i2c syntax?
I want to convert this arduino sketch to RPi c++ program using WiringPi.
This works:
#include <Wire.h>
#define AHT21 0x38
void setup() {
// put your setup code here, to run once:
Wire.begin(); // the SDA and SCL
Serial.begin(9600);
uint8_t rawData[7] = {0,0,0,0,0,0,0};
Wire.beginTransmission(AHT21);
Wire.write(0xAC); //send measurement command, start measurement
Wire.write(0x33); //send measurement control
Wire.write(0x00); //send measurement NOP control
Wire.endTransmission();
delay(100);
Wire.requestFrom(AHT21, 6);
for (uint8_t i = 0; i < 6; i++)
{
rawData[i] = Wire.read();
Serial.print(i);Serial.print(": ");
Serial.println(rawData[i]);
}
}
// Intentionally empty: the whole demonstration runs once in setup().
void loop() {}
Gives:
0: 28
1: 106
2: 90
3: 117
4: 126
5: 70
This RPI code fails giving the status byte over and over:
#include <wiringPi.h>
#include <wiringPiI2C.h>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#define Address 0x38
/*
 * Send the AHT21 measurement command byte by byte, wait 100 ms, then read
 * and print six bytes, one wiringPiI2CRead() call per byte.
 *
 * Returns 1 if the I2C device cannot be opened, 0 otherwise.
 *
 * NOTE(review): the sample output shows every read returning the same value,
 * so presumably each wiringPiI2CWrite()/wiringPiI2CRead() call is its own
 * one-byte bus transaction rather than part of one burst - confirm against
 * the WiringPi I2C documentation.
 */
int main (int argc, char **argv)
{
    (void)argc;
    (void)argv;

    int fd = wiringPiI2CSetup(Address);
    if (fd < 0)
    {
        // Without this check every later call would operate on an invalid
        // handle and the printed bytes would be meaningless.
        fprintf(stderr, "wiringPiI2CSetup(0x%02X) failed\n", Address);
        return 1;
    }

    uint8_t rawData[7] = {0,0,0,0,0,0,0};
    wiringPiI2CWrite(fd,0xAC); //send measurement command, start measurement
    wiringPiI2CWrite(fd,0x33); //send measurement control
    wiringPiI2CWrite(fd,0x00); //send measurement NOP control
    delay(100);
    for (uint8_t i = 0; i < 6; i++)
    {
        rawData[i] = wiringPiI2CRead(fd);
        printf("%d: %d\n",i,rawData[i]);
    }
    return 0;
}
Gives:
./aht21
0: 28
1: 28
2: 28
3: 28
4: 28
5: 28

I abandoned WiringPi and went with ioctl and i2c-dev.h. Works fine:
//gcc -g -Wall -Wextra -pedantic -std=c11 -D_DEFAULT_SOURCE -D_BSD_SOURCE -o aht21 aht21.c
#include <stdio.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h> // read/write usleep
#include <stdlib.h> // exit function
#include <inttypes.h> // uint8_t, etc
#include <linux/i2c-dev.h> // I2C bus definitions
/*
 * Read one humidity/temperature sample from the device at I2C address 0x38
 * on /dev/i2c-1 and print "<humidity>,<temperature>" with two decimals.
 *
 * The three-byte trigger command is sent in a single write(), the program
 * sleeps for one second, and seven bytes are read back in a single read().
 * Every system call is checked; on failure a message is printed and the
 * process exits with a non-zero status.
 */
int main (int argc, char **argv)
{
    (void)argc;
    (void)argv;

    int status = EXIT_SUCCESS;

    // Create I2C bus
    const char *bus = "/dev/i2c-1";
    int fd = open(bus, O_RDWR);
    if (fd < 0)
    {
        printf("Failed to open the bus. \n");
        exit(1);
    }

    // Get I2C device,
    if (ioctl(fd, I2C_SLAVE, 0x38) < 0)
    {
        printf("Failed to select the I2C device. \n");
        close(fd);
        exit(1);
    }

    // uint8_t rather than char: 0xAC does not fit in a signed char.
    const uint8_t TriggerCMD[3] = {0xAC, 0x33, 0x00};
    if (write(fd, TriggerCMD, sizeof TriggerCMD) != (ssize_t)sizeof TriggerCMD)
    {
        printf("Failed to send the measurement command. \n");
        close(fd);
        exit(1);
    }
    sleep(1);

    uint8_t rawData[7] = {0, 0, 0, 0, 0, 0, 0};
    if (read(fd, rawData, sizeof rawData) != (ssize_t)sizeof rawData)
    {
        printf("Error : Input/Output Error \n");
        status = EXIT_FAILURE;
    }
    else
    {
        // Bytes 1..2 and the high nibble of byte 3 form the 20-bit humidity.
        uint32_t humidity = rawData[1]; //20-bit raw humidity data
        humidity <<= 8;
        humidity |= rawData[2];
        humidity <<= 4;
        humidity |= rawData[3] >> 4;

        // The low nibble of byte 3 and bytes 4..5 form the 20-bit temperature.
        uint32_t temperature = rawData[3] & 0x0F; //20-bit raw temperature data
        temperature <<= 8;
        temperature |= rawData[4];
        temperature <<= 8;
        temperature |= rawData[5];

        // Scale both against 2^20; the temperature is then mapped with
        // * 200 - 50 and finally * 1.8 + 32.
        float ahtHum = ((float)humidity / 0x100000) * 100.0;
        float ahtTemp = (((float)temperature / 0x100000) * 200.0 - 50.0) * 1.8 + 32.0;
        printf("%.2f,%.2f\n", ahtHum, ahtTemp);
    }
    close(fd);
    return status;
}

DPDK implementation of MPSC ring buffer

While going through the implementation of the DPDK MPSC (multi-produce & single-consumer) Ring Buffer API, i found the code to move the head of the producer for inserting new elements in the Ring buffer. The function is as follows :
/*
 * Reserve up to n slots at the producer head of ring r.
 *
 * r            - ring whose prod.head is advanced
 * is_sp        - non-zero: store the new head directly; zero: publish it with
 *                a compare-and-exchange and retry on failure
 * n            - number of slots requested
 * behavior     - RTE_RING_QUEUE_FIXED: reserve n slots or none; any other
 *                value: reserve as many as are free
 * old_head     - out: head value the reservation starts at
 * new_head     - out: head value after the reservation
 * free_entries - out: free slots computed before the reservation
 *
 * Returns the number of slots reserved, or 0 when none could be reserved.
 */
static __rte_always_inline unsigned int
__rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
unsigned int n, enum rte_ring_queue_behavior behavior,
uint32_t *old_head, uint32_t *new_head,
uint32_t *free_entries)
{
const uint32_t capacity = r->capacity;
uint32_t cons_tail;
unsigned int max = n;
int success;
/* Initial snapshot of the head; refreshed by a failed compare-exchange. */
*old_head = __atomic_load_n(&r->prod.head, __ATOMIC_RELAXED);
do {
/* Reset n to the initial burst count */
n = max;
/* Ensure the head is read before tail */
__atomic_thread_fence(__ATOMIC_ACQUIRE);
/* load-acquire synchronize with store-release of ht->tail
* in update_tail.
*/
cons_tail = __atomic_load_n(&r->cons.tail,
__ATOMIC_ACQUIRE);
/* The subtraction is done between two unsigned 32bits value
* (the result is always modulo 32 bits even if we have
* *old_head > cons_tail). So 'free_entries' is always between 0
* and capacity (which is < size).
*/
*free_entries = (capacity + cons_tail - *old_head);
/* check that we have enough room in ring */
if (unlikely(n > *free_entries))
n = (behavior == RTE_RING_QUEUE_FIXED) ?
0 : *free_entries;
/* Nothing can be reserved: return without touching prod.head. */
if (n == 0)
return 0;
*new_head = *old_head + n;
if (is_sp)
r->prod.head = *new_head, success = 1;
else
/* on failure, *old_head is updated */
success = __atomic_compare_exchange_n(&r->prod.head,
old_head, *new_head,
0, __ATOMIC_RELAXED,
__ATOMIC_RELAXED);
} while (unlikely(success == 0));
return n;
}
The load and compare exchange of the producer's head is done using __ATOMIC_RELAXED memory ordering. Isn't this a problem when multiple producers from different threads produce to the queue. Or am I missing something?
https://doc.dpdk.org/guides/prog_guide/ring_lib.html describes the basic mechanism that DPDK uses for implementing the Ring buffer.

How to read data from MPU6050 using STM32F4

I need to monitor the acceleration of an object. I'm using the MPU6050 (accelerometer and gyroscope) and the STM32F401RBT6 controller. The code below is the solution that I'm using for this.
/* Device address passed to the HAL I2C memory read/write calls. */
#define MPU6050_ADDR 0xD0
/* Register addresses used with HAL_I2C_Mem_Read / HAL_I2C_Mem_Write. */
#define SMPLRT_DIV_REG 0x19
#define GYRO_CONFIG_REG 0x1B
#define ACCEL_CONFIG_REG 0x1C
#define ACCEL_XOUT_H_REG 0x3B
#define TEMP_OUT_H_REG 0x41
#define GYRO_XOUT_H_REG 0x43
#define PWR_MGMT_1_REG 0x6B
#define WHO_AM_I_REG 0X75
/* Latest raw 16-bit samples per axis, written by MPU6050_Read_Accel(). */
/* NOTE(review): stored as unsigned here - confirm against the sensor's data format. */
uint16_t Accel_X_RAW,Accel_Y_RAW,Accel_Z_RAW;
/* Raw samples divided by 16384 (integer division), per axis. */
uint16_t Ax,Ay,Az;
/* Text buffer that main() formats into and transmits. */
char buffer[10];
void MPU6050_Init(void)
{
uint8_t check, data;
HAL_I2C_Mem_Read(&hi2c3,MPU6050_ADDR,WHO_AM_I_REG,1,&check,1,100);
if(check == 104)
{
data = 0x07;
HAL_I2C_Mem_Write(&hi2c3,MPU6050_ADDR,SMPLRT_DIV_REG,1,&data,1,50);
HAL_Delay(50);
data = 0x00;
HAL_I2C_Mem_Write(&hi2c3,MPU6050_ADDR,ACCEL_CONFIG_REG,1,&data,1,50);
HAL_Delay(50);
data = 0x00;
HAL_I2C_Mem_Write(&hi2c3,MPU6050_ADDR,GYRO_CONFIG_REG,1,&data,1,50);
HAL_Delay(50);
data = 0;
HAL_I2C_Mem_Write(&hi2c3,MPU6050_ADDR,PWR_MGMT_1_REG,1,&data,1,50);
HAL_Delay(50);
}
}
/*
 * Read six bytes starting at ACCEL_XOUT_H and update the global samples.
 *
 * Each axis is assembled from two consecutive bytes, high byte first, into
 * Accel_*_RAW; Ax/Ay/Az receive the raw value divided by 16384. A 50 ms
 * delay follows the bus read.
 */
void MPU6050_Read_Accel(void)
{
    uint8_t raw[6] = {0};

    HAL_I2C_Mem_Read(&hi2c3, MPU6050_ADDR, ACCEL_XOUT_H_REG,
                     I2C_MEMADD_SIZE_8BIT, raw, 6, 100);
    HAL_Delay(50);

    Accel_X_RAW = (uint16_t)((raw[0] << 8) | raw[1]);
    Accel_Y_RAW = (uint16_t)((raw[2] << 8) | raw[3]);
    Accel_Z_RAW = (uint16_t)((raw[4] << 8) | raw[5]);

    Ax = (uint16_t)(Accel_X_RAW / 16384);
    Ay = (uint16_t)(Accel_Y_RAW / 16384);
    Az = (uint16_t)(Accel_Z_RAW / 16384);
}
/*
 * Initialise the board and the sensor, then stream the raw X-axis sample
 * over USB CDC forever as text of the form "<value> / ".
 */
int main(void)
{
    HAL_Init();
    SystemClock_Config();
    MX_I2C3_Init();
    MX_GPIO_Init();
    MX_USB_DEVICE_Init();
    MPU6050_Init();
    while (1)
    {
        MPU6050_Read_Accel();

        /* Bounded formatting; send only the characters actually produced
         * instead of the whole 10-byte buffer, which would include the
         * terminator and stale bytes after it. */
        int len = snprintf(buffer, sizeof(buffer), "%d / ", Accel_X_RAW);
        if (len > 0)
        {
            if ((size_t)len >= sizeof(buffer))
            {
                len = (int)sizeof(buffer) - 1;
            }
            CDC_Transmit_FS((uint8_t *)buffer, (uint16_t)len);
        }
    }
}
I already did it on ATMEL Controler (Arduino) and it worked, but not on STM32.
I am trying to read the value of the X axis and show it using USB CDC. This code sets the `Accel_X_RAW` variable to a value between 0 and 65535. On the Arduino, the reference value was 32768 when the object was at rest, but with the STM32 the reading stays at the maximum value (65535) when there is no movement. I don't know what's wrong with this code; I tried many options, but it still doesn't work. Can you help me, please?

According to the MPU6050 datasheet, the 16-bit values for acceleration and gyroscope are returned in the signed 2's complement form (it detects acceleration values in the range +-g). As you are receiving signed data in the unsigned variables, the result is not what you expect. Therefore, replace all uint16_t datatypes with int16_t.
The reason why you are getting 65535 value; the hex value of -1 in signed int16_t form is 0xFFFF. However, if you store it in the uint16_t variable, it will be read as 65535. I am assuming that the default acceleration value at rest is -1g.
#include <stdlib.h>
#include <string.h> /* For using memset */
/* Device address passed to the HAL I2C memory read/write calls. */
#define MPU6050_ADDR 0xD0
/* Register addresses used with HAL_I2C_Mem_Read / HAL_I2C_Mem_Write. */
#define SMPLRT_DIV_REG 0x19
#define GYRO_CONFIG_REG 0x1B
#define ACCEL_CONFIG_REG 0x1C
#define ACCEL_XOUT_H_REG 0x3B
#define TEMP_OUT_H_REG 0x41
#define GYRO_XOUT_H_REG 0x43
#define PWR_MGMT_1_REG 0x6B
#define WHO_AM_I_REG 0X75
/* Latest raw signed 16-bit samples per axis, written by MPU6050_Read_Accel(). */
int16_t Accel_X_RAW,Accel_Y_RAW,Accel_Z_RAW;
/* Raw samples divided by 16384 (integer division), per axis. */
int16_t Ax,Ay,Az;
/* Text buffer that main() formats into and transmits. */
char buffer[10];
void MPU6050_Init(void)
{
uint8_t check, data;
HAL_I2C_Mem_Read(&hi2c3,MPU6050_ADDR,WHO_AM_I_REG,1,&check,1,100);
if(check == 104)
{
data = 0x07;
HAL_I2C_Mem_Write(&hi2c3,MPU6050_ADDR,SMPLRT_DIV_REG,1,&data,1,50);
HAL_Delay(50);
data = 0x00;
HAL_I2C_Mem_Write(&hi2c3,MPU6050_ADDR,ACCEL_CONFIG_REG,1,&data,1,50);
HAL_Delay(50);
data = 0x00;
HAL_I2C_Mem_Write(&hi2c3,MPU6050_ADDR,GYRO_CONFIG_REG,1,&data,1,50);
HAL_Delay(50);
data = 0;
HAL_I2C_Mem_Write(&hi2c3,MPU6050_ADDR,PWR_MGMT_1_REG,1,&data,1,50);
HAL_Delay(50);
}
}
void MPU6050_Read_Accel(void)
{
uint8_t recData[6];
//for(int i=0;i<6;i++) recData[i] = 0;
memset(recData, 0, sizeof(recData));
HAL_I2C_Mem_Read(&hi2c3,MPU6050_ADDR,ACCEL_XOUT_H_REG,I2C_MEMADD_SIZE_8BIT,recData,6,100);
HAL_Delay(50);
Accel_X_RAW = (int16_t)(recData[0] << 8 | recData[1]);
Accel_Y_RAW = (int16_t)(recData[2] << 8 | recData[3]);
Accel_Z_RAW = (int16_t)(recData[4] << 8 | recData[5]);
Ax = (int16_t)(Accel_X_RAW / 16384);
Ay = (int16_t)(Accel_Y_RAW / 16384);
Az = (int16_t)(Accel_Z_RAW / 16384);
}
/*
 * Initialise the board and the sensor, then stream the raw X-axis sample
 * over USB CDC forever as text of the form "<value> / ".
 */
int main(void)
{
    HAL_Init();
    SystemClock_Config();
    MX_I2C3_Init();
    MX_GPIO_Init();
    MX_USB_DEVICE_Init();
    MPU6050_Init();
    while (1)
    {
        MPU6050_Read_Accel();

        /* "-32768 / " needs all ten bytes of the buffer including the
         * terminator, so format with an explicit bound and transmit only
         * the characters produced, not the terminator or stale bytes. */
        int len = snprintf(buffer, sizeof(buffer), "%d / ", Accel_X_RAW);
        if (len > 0)
        {
            if ((size_t)len >= sizeof(buffer))
            {
                len = (int)sizeof(buffer) - 1;
            }
            CDC_Transmit_FS((uint8_t *)buffer, (uint16_t)len);
        }
    }
}

FreeRTOS getting the current time

I have code that runs with FreeRTOS and I want to edit it,
It is code that measures the pressure and the temperature, and I want to record the time when these measurements are taken.
Could anyone tell me how to get the current time in my machine or the date?
Thank you.
This is the code that I am using right now.
//#include <stdio.h>
//#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include "platform.h"
#include "printf.h"
#include "lps331ap.h"
static void app_task(void *);
/*
 * Program entry point: initialise the platform, register the sensor task
 * and hand control to platform_run(). Returns 0 if platform_run() returns.
 */
int main(int argc, char *argv[])
{
    /* Platform bring-up comes before any task is created. */
    platform_init();

    /* One application task named "lps331": minimal stack, no argument,
     * priority 1, task handle not kept. */
    xTaskCreate(app_task,
                (const signed char * const) "lps331",
                configMINIMAL_STACK_SIZE,
                NULL,
                1,
                NULL);

    /* Run */
    platform_run();

    return 0;
}
/*
 * Application task: configure the LPS331AP and print one line per sample.
 *
 * After the banner messages the sensor is powered down and its data rate is
 * set. The loop then reads pressure and temperature and prints
 * "<count>\t<42.5 + temp / 480.0>\t<pres / 4096.0>", pausing 2000 ticks
 * between iterations. `param` is not used.
 */
static void app_task(void *param)
{
    uint32_t pres;
    int16_t temp;
    int count = 0;

    printf("# Testing LPS331AP\n");

    printf("# Initializing LPS331AP...\n");
    lps331ap_powerdown();

    printf("# Setting LPS331AP pressure sensor\n");
    lps331ap_set_datarate(LPS331AP_P_12_5HZ_T_12_5HZ);

    for (;;)
    {
        lps331ap_read_pres(&pres);
        lps331ap_read_temp(&temp);

        printf("%d\t", count);
        printf("%f\t", 42.5 + temp / 480.0);
        printf("%f\n", pres / 4096.0);
        count += 1;

        /* Delay is given in ticks. */
        vTaskDelay(2000);
    }
}

For measuring time, there's xTaskGetTickCount, but this will be limited to the resolution of your tick rate.
Alternatively, you can create another task that ticks at 1 Hz to increment a counter and use that as system time.
To get an actual date however, you'll need to consult your development board and see if it has an RTC. Otherwise, you'll need to get the time manually somehow and maintain this count using your development board's hardware timers.
There's bound to be some hardware clocks in your development board that you can use to measure time as well.

How to use Backup SRAM as EEPROM in STM32F4

There are two ways of emulating EEPROM on the STM32F4:
On-chip 4 Kbytes backup SRAM
On-chip Flash, with specific software algorithm
The second option is described here: AN3969.
But google, unfortunately, hasn't been able to provide information on how to use the first option - using the 4Kb of backup SRAM as EEPROM?..
Can anyone help on the topic?

must do these:
Enable the PWR clock
RCC_APB1PeriphClockCmd(RCC_APB1Periph_PWR, ENABLE);
Enable access to the backup domain
PWR_BackupAccessCmd(ENABLE);
Enable backup SRAM Clock
RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);
Enable the Backup SRAM low power Regulator to retain its content in VBAT mode
PWR_BackupRegulatorCmd(ENABLE);
and you can write/read data to the SRAM (this code is from the BKP_Domain example in STM32F4xx_DSP_StdPeriph_Lib) (in my MCU, an stm32f417, BKPSRAM_BASE=0x40024000)
// Write to Backup SRAM with 32-Bit Data
// Fills the first 0x100 bytes one 32-bit word at a time; each word is set to
// its own byte offset so it can be verified afterwards.
for (i = 0x0; i < 0x100; i += 4) {
*(__IO uint32_t *) (BKPSRAM_BASE + i) = i;
}
// Check the written Data
// Reads every word back and counts mismatches in errorindex.
for (i = 0x0; i < 0x100; i += 4) {
if ((*(__IO uint32_t *) (BKPSRAM_BASE + i)) != i){
errorindex++;
}
}
then if you want:
// Wait until the Backup SRAM low power Regulator is ready
// Busy-waits with no timeout for as long as PWR_FLAG_BRR reads RESET.
while(PWR_GetFlagStatus(PWR_FLAG_BRR) == RESET)
{}
you can find these functions in STM32F4xx_DSP_StdPeriph_Lib.

after reading through the Reference Manual for stm32f4 and the stm32f405xx/stm32f407xx datasheet I agree that it isn't clear how to actually use the backup sram (or where it is located). Here is what I found. Both the RTC registers and backup SRAM contain some amount of storage that is maintained as long as you have battery power. The RTC contains 20 registers (80 bytes) and the backup sram (which is its own peripheral on AHB1 and located within the register address region) contains 0x1000 (4096 bytes). Neither are enabled by default.
in DM00037051 (stm32f405xx/stm32f407xx datasheet, p29):
The 4-Kbyte backup SRAM is an EEPROM-like memory area. It can be used to store
data which need to be retained in VBAT and standby mode. This memory area is
disabled by default to minimize power consumption (see Section 2.2.19:
Low-power modes). It can be enabled by software.
The backup registers are 32-bit registers used to store 80 bytes of user
application data when VDD power is not present. Backup registers are not reset
by a system, a power reset, or when the device wakes up from the Standby mode
(see Section 2.2.19: Low-power modes).
on page 71 of datasheet and p65 of the reference manual
AHB1 | 0x4002 4000 - 0x4002 4FFF | BKPSRAM
and page 73 of the datasheet and p67 of the reference manual
APB1 | 0x4000 2800 - 0x4000 2BFF | RTC & BKP Registers
Page 118-119 of the reference manual contains information on enabling the backup SRAM and RTC registers.
NOTE: if you are already using the RTC in the backup domain and only need to store <= 80 bytes, then you are better off using the RTC backup registers because enabling the backup SRAM will basically double the current consumption (see table 25 in the stm32f405/7 datasheet).
here are my write and read functions for both backup SRAM and backup RTC registers
/*
 * Copy `bytes` bytes from `data` into the backup SRAM starting at `offset`.
 *
 * data   - source buffer, at least `bytes` long
 * bytes  - number of bytes to write
 * offset - byte offset from BKPSRAM_BASE
 *
 * Returns 0 on success, or -1 when the requested range does not fit inside
 * the 0x1000-byte region. A range that ends exactly on the last byte
 * (offset + bytes == 0x1000) is valid.
 */
int8_t write_to_backup_sram( uint8_t *data, uint16_t bytes, uint16_t offset ) {
    const uint16_t backup_size = 0x1000;
    uint8_t* base_addr = (uint8_t *) BKPSRAM_BASE;
    uint16_t i;

    /* The last byte written is at offset + bytes - 1, so the range is out of
     * bounds only when offset + bytes exceeds the region size. Widened to
     * 32 bits so the sum cannot wrap. */
    if( (uint32_t)bytes + (uint32_t)offset > backup_size ) {
        /* ERROR : the last byte is outside the backup SRAM region */
        return -1;
    }

    RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);

    /* disable backup domain write protection */
    RCC_APB1PeriphClockCmd(RCC_APB1Periph_PWR, ENABLE);   // set RCC->APB1ENR.pwren
    PWR_BackupAccessCmd(ENABLE);                          // set PWR->CR.dbp = 1;

    /** enable the backup regulator (used to maintain the backup SRAM content in
      * standby and Vbat modes).  NOTE : this bit is not reset when the device
      * wakes up from standby, system reset or power reset. You can check that
      * the backup regulator is ready on PWR->CSR.brr, see rm p144 */
    PWR_BackupRegulatorCmd(ENABLE);     // set PWR->CSR.bre = 1;

    for( i = 0; i < bytes; i++ ) {
        *(base_addr + offset + i) = *(data + i);
    }

    PWR_BackupAccessCmd(DISABLE);                     // reset PWR->CR.dbp = 0;
    return 0;
}
/**
 * Copy `bytes` bytes out of the backup SRAM, starting `offset` bytes past
 * BKPSRAM_BASE, into `data`.
 *
 * Only the backup SRAM clock is enabled here; no write access to the backup
 * domain is requested.
 *
 * @param data    destination buffer, at least `bytes` long
 * @param bytes   number of bytes to copy
 * @param offset  byte offset of the first source byte in backup SRAM
 * @return 0 on success, -1 if [offset, offset + bytes) does not fit inside
 *         the 0x1000-byte backup SRAM
 */
int8_t read_from_backup_sram( uint8_t *data, uint16_t bytes, uint16_t offset ) {
  const uint32_t backup_size = 0x1000;
  const volatile uint8_t *base_addr = (const volatile uint8_t *) BKPSRAM_BASE;
  uint16_t i;
  /* Reading up to and including the last byte is legal, so only a range
   * that extends past backup_size is rejected ('>' rather than '>='). */
  if( (uint32_t) bytes + offset > backup_size ) {
    /* ERROR : the last byte is outside the backup SRAM region */
    return -1;
  }
  RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);
  for( i = 0; i < bytes; i++ ) {
    data[i] = base_addr[offset + i];
  }
  return 0;
}
/**
 * Store `bytes` bytes from `data` in the RTC backup registers, starting
 * `offset` bytes past RTC->BKP0R.
 *
 * The registers are 32 bits wide, so both `offset` and `bytes` must be
 * multiples of 4. The PWR clock is enabled and backup-domain write
 * protection is lifted only while the words are written.
 *
 * @param data    source words, at least `bytes` / 4 entries
 * @param bytes   number of bytes to store (multiple of 4)
 * @param offset  byte offset of the first register (multiple of 4)
 * @return 0 on success, -1 if the range does not fit in the 80 bytes of
 *         backup registers, -2 if `offset` or `bytes` is not word aligned
 */
int8_t write_to_backup_rtc( uint32_t *data, uint16_t bytes, uint16_t offset ) {
  const uint32_t backup_size = 80;
  volatile uint32_t* base_addr = &(RTC->BKP0R);
  uint16_t first_word;
  uint16_t words;
  uint16_t i;
  /* '>' rather than '>=': a transfer ending on the last register is valid */
  if( (uint32_t) bytes + offset > backup_size ) {
    /* ERROR : the last byte is outside the RTC backup register region */
    return -1;
  }
  if( offset % 4 || bytes % 4 ) {
    /* ERROR: data start or num bytes are not word aligned */
    return -2;
  }
  /* base_addr is a word pointer, so both the byte offset and the byte count
   * have to be converted to word units before indexing; adding the raw byte
   * offset would address the wrong register and could run past the last one */
  first_word = offset / 4;
  words = bytes / 4;
  /* disable backup domain write protection */
  RCC_APB1PeriphClockCmd(RCC_APB1Periph_PWR, ENABLE); // set RCC->APB1ENR.pwren
  PWR_BackupAccessCmd(ENABLE);                        // set PWR->CR.dbp = 1;
  for( i = 0; i < words; i++ ) {
    base_addr[first_word + i] = data[i];
  }
  PWR_BackupAccessCmd(DISABLE);                       // reset PWR->CR.dbp = 0;
  // consider also disabling the power peripheral?
  return 0;
}
/**
 * Load `bytes` bytes from the RTC backup registers, starting `offset` bytes
 * past RTC->BKP0R, into `data`.
 *
 * Uses the same byte-based addressing and the same error codes as
 * write_to_backup_rtc(), so a value written at a given offset is read back
 * from that same offset.
 *
 * @param data    destination words, at least `bytes` / 4 entries
 * @param bytes   number of bytes to load (multiple of 4)
 * @param offset  byte offset of the first register (multiple of 4)
 * @return 0 on success, -1 if the range does not fit in the 80 bytes of
 *         backup registers, -2 if `offset` or `bytes` is not word aligned
 */
int8_t read_from_backup_rtc( uint32_t *data, uint16_t bytes, uint16_t offset ) {
  const uint32_t backup_size = 80;
  volatile uint32_t* base_addr = &(RTC->BKP0R);
  uint16_t first_word;
  uint16_t words;
  uint16_t i;
  /* '>' rather than '>=': a transfer ending on the last register is valid */
  if( (uint32_t) bytes + offset > backup_size ) {
    /* ERROR : the last byte is outside the RTC backup register region */
    return -1;
  }
  if( offset % 4 || bytes % 4 ) {
    /* ERROR: data start or num bytes are not word aligned */
    return -2;
  }
  /* convert byte units to word units because base_addr is a word pointer */
  first_word = offset / 4;
  words = bytes / 4;
  /* read should be 32 bit aligned */
  for( i = 0; i < words; i++ ) {
    data[i] = base_addr[first_word + i];
  }
  return 0;
}

I had to jump from main program to bootloader on user request.
So I put some 'magic number' into BKPSRAM in main program, do CPU soft reset.
Bootloader always starts first.
It checks for the 'magic number': if it is present, the bootloader executes; otherwise it starts the main program.
when using HAL this is how to jump to bootloader:
/* Request the bootloader: leave a marker in backup SRAM, then soft-reset. */
/* Enable the PWR peripheral clock. */
__HAL_RCC_PWR_CLK_ENABLE();
/* Enable access to the backup domain. */
HAL_PWR_EnableBkUpAccess();
/* Enable the backup SRAM clock. */
__BKPSRAM_CLK_ENABLE();
/* Store the marker in the first backup SRAM byte (0x40024000). */
*(__IO uint8_t *)0x40024000 = 42;//magic number
/* Soft reset; the marker is expected to still be there when the bootloader
 * checks for it after the restart. */
HAL_NVIC_SystemReset();
inside bootloader to read magic number it is enough to enable backup sram clock only (bootloader uses StdPeriphDriver).
/* Reading only needs the backup SRAM clock; no backup-domain write access
 * is requested here. */
RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);
/* Fetch the byte at 0x40024000, where the main program leaves its marker. */
extRequest = *(__IO uint8_t *)0x40024000;
/* 42 is the marker value meaning "stay in the bootloader". */
if(extRequest == 42)
//run bootloader
cpu is stm32f407

Here is an example that uses the HAL library to access the backup SRAM.
/* Byte offset from BKPSRAM_BASE of the word accessed by writeBkpSram() and
 * readBkpSram().
 * NOTE(review): both helpers access this location through a uint32_t
 * pointer, so an offset of 0x01 gives an address that is not 4-byte
 * aligned - confirm the target tolerates that, or use a multiple of 4. */
#define WRITE_READ_ADDR 0x01 //offset value.you can change according to your application
/* Test pattern that main() stores in backup SRAM. */
uint32_t write_arr = 0xA5A5A5A6;
/* Receives the value that main() reads back from backup SRAM. */
uint32_t read_arr;
/* Demo entry point: prepare the backup SRAM, store the test pattern once,
 * then keep reading it back into read_arr forever. */
int main()
{
    enable_backup_sram();
    writeBkpSram(write_arr);

    /* Never returns: poll the stored word continuously. */
    for (;;)
    {
        read_arr = readBkpSram();
    }
}
/*
 * Prepare the backup domain so that the backup SRAM can be written and keeps
 * its content.
 *
 * The PWR clock is enabled first, because the calls that follow write PWR
 * registers. Backup-domain access is then left enabled on return: after
 * reset the backup domain (including backup SRAM) is protected against
 * writes, so switching access off again here would block the stores done
 * afterwards.
 */
void enable_backup_sram(void)
{
    /*PWREN : Enable the PWR peripheral clock */
    __HAL_RCC_PWR_CLK_ENABLE();
    /*DBP : Enable access to Backup domain */
    HAL_PWR_EnableBkUpAccess();
    /*BRE : Enable backup regulator
      BRR : Wait for backup regulator to stabilize */
    HAL_PWREx_EnableBkUpReg();
}
/*
 * Store l_data in the backup SRAM word at BKPSRAM_BASE + WRITE_READ_ADDR.
 *
 * The backup SRAM clock is switched on only for the duration of the store.
 * The backup domain is write-protected after reset, so access to it must
 * already have been enabled for the store to take effect.
 */
void writeBkpSram(uint32_t l_data)
{
    /* Enable clock to BKPSRAM */
    __HAL_RCC_BKPSRAM_CLK_ENABLE();
    /* Dereference the target address for the store; assigning to the cast
       pointer itself is not valid C. volatile keeps the access in place
       between the two clock operations. */
    *(volatile uint32_t *) (BKPSRAM_BASE + WRITE_READ_ADDR) = l_data;
    /* Disable clock to BKPSRAM */
    __HAL_RCC_BKPSRAM_CLK_DISABLE();
}
/*
 * Return the backup SRAM word at BKPSRAM_BASE + WRITE_READ_ADDR.
 *
 * The backup SRAM clock is switched on only for the duration of the read.
 */
uint32_t readBkpSram(void)
{
    uint32_t i_retval;
    /* Enable clock to BKPSRAM */
    __HAL_RCC_BKPSRAM_CLK_ENABLE();
    /* Pointer read from specific location of backup SRAM; volatile keeps
       the load between the clock enable and the clock disable */
    i_retval = *(volatile uint32_t *) (BKPSRAM_BASE + WRITE_READ_ADDR);
    /* Disable clock to BKPSRAM */
    __HAL_RCC_BKPSRAM_CLK_DISABLE();
    return i_retval;
}

I'm currently using an STM32F2xx microcontroller. According to the datasheet:
The 4-Kbyte backup SRAM is an EEPROM-like area.
To retain the content of the RTC backup registers … when VDD is turned off, VBAT pin can be connected to an optional standby voltage supplied by a battery or by another source.
A supercap, for example, would be required to maintain the contents of the backup registers while the microcontroller is powered off.
Also, according to the document:
After reset, the backup domain (… backup SRAM) is protected against possible unwanted write accesses. To enable access to the backup domain, proceed as follows …
It gives you instructions on how to gain access to the backup domain by directly writing to the certain peripheral register. If you have access to the STM32F4xx library, you can call something like this (note: I'm using the STM32F2xx library):
/* Enable access to the backup domain (sets PWR->CR.dbp). */
PWR_BackupAccessCmd(ENABLE);
Note: There is more to it than simply calling the above function, such as enabling the backup SRAM interface clock. Consult the STM32F4 series documentation.
There is a lot of documentation embedded in the library source that is invaluable and if it's available should be read.
On the STM32F2 series microcontroller, the backup SRAM is located at the following memory address range:
0x40024000 - 0x40024FFF
A location within that range can be written to, for example, as follows:
/* Address of the first byte of the backup SRAM range (0x40024000). */
#define VAR_LOC ((volatile uint8_t *)(0x40024000))
/* Byte pointer to that location; volatile so every access is performed. */
volatile uint8_t *pVar = VAR_LOC;
/* Store the value 5 in the first backup SRAM byte. */
*pVar = 5;

Useable example
In header:
//------------------------------------
/* Record of settings that is overlaid on the backup SRAM through the
 * BKPSRAM pointer declared below. */
typedef struct
{
uint32_t isDefault; // must be 0x12345678
uint32_t LastTestNumber;
uint32_t LastUserNumber;
uint32_t LastModeTest;
uint32_t calibv;
uint32_t calibc;
uint32_t WorkTime;
int32_t RTCCalib;
uint32_t LCDContrast;
} sBKPSRAM;
/* Declaration only; the pointer is defined once in a source file as
 * (sBKPSRAM *)BKPSRAM_BASE. */
extern sBKPSRAM *BKPSRAM;
//------------------------------------
In code head
define as data:
/* Points the settings record at the start of the backup SRAM. */
sBKPSRAM *BKPSRAM = (sBKPSRAM *)BKPSRAM_BASE;
In Init:
/* Start-up sequence for the backup SRAM settings record; "(....)" marks
 * application code that was left out of the example. */
void main(void)
{
(....)
/* Enable the backup SRAM clock. */
RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_BKPSRAM, ENABLE);
/* Enable access to the backup domain.
 * NOTE(review): no PWR clock enable is visible before this call - confirm
 * that it happens in the omitted code above. */
PWR_BackupAccessCmd(ENABLE);
/* Enable the backup regulator. */
PWR_BackupRegulatorCmd(ENABLE);
/* Load default settings if the record is not marked as initialised. */
ifDefault();
(....)
}
In procedure:
//-------------------------------------------------
/*
 * Load factory defaults into the backup SRAM record when it has not been
 * initialised yet.
 *
 * The record counts as initialised when its isDefault field holds
 * 0x12345678. Using that dedicated field, rather than LastModeTest, leaves
 * LastModeTest free to hold real data. The marker is written last, so a
 * reset part-way through leaves the record uninitialised and the defaults
 * are loaded again on the next start.
 */
void ifDefault(void)
{
    if (BKPSRAM->isDefault != 0x12345678)
    {
        printf("BKPSRAM to default\r\n");
        /* Clear the whole record, then fill in the non-zero defaults. */
        memset(BKPSRAM, 0, sizeof *BKPSRAM);
        BKPSRAM->calibv      = 66920;
        BKPSRAM->calibc      = 79230;
        BKPSRAM->RTCCalib    = 1;
        BKPSRAM->LCDContrast = 2;
        BKPSRAM->isDefault   = 0x12345678;
    }
}
//-------------------------------------------------

HAL Configuration for STM32H7 to access backup SRAM:
/* Lvalue for the first 32-bit word of the backup SRAM. */
#define BKP_RAM (*(__IO uint32_t *) (D3_BKPSRAM_BASE)) //Start address: 0x38800000
/* Example sequence: enable the backup RAM clock and the backup regulator,
 * then store a test pattern in the first word.
 * NOTE(review): no call that enables backup-domain access is shown here -
 * confirm whether it is needed before the write. */
Main() {
__HAL_RCC_BKPRAM_CLK_ENABLE();
HAL_PWREx_EnableBkUpReg();
BKP_RAM = 0xA5AA5A55;
}
In addition to that, you need to add the line below to systemInit() to enable write-through access to the backup SRAM.
/* Set bit 2 of SCB->CACR; per the accompanying text this enables
 * write-through access to the backup SRAM. */
SCB->CACR |= 1<<2;

We Keep Coding

iphone swift flutter scala powershell matlab mongodb postgresql perl eclipse