I am trying to find a PMC (Performance Monitoring Counter) that will display the amount of times that a prefetcht0
instruction hits L1 dcache (or misses).
icelake-client: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
I am trying to make this fine grain i.e (note should include lfence
around prefetcht0
)
xorl %ecx, %ecx
rdpmc
movl %eax, %edi
prefetcht0 (%rsi)
rdpmc
testl %eax, %edi
// jump depending on if it was a miss or not
The goal is to check if a prefetch hit L1. If didn't execute some code that is ready, otherwise proceed.
It seems that it will have to be a miss event just based on what is available.
I have tried a few events from libpfm4 and intel manual with no luck:
L1-DCACHE-LOAD-MISSES, emask=0x00, umask=0x10000
L1D.REPLACEMENT, emask=0x51, umask=0x1
L2_RQSTS.SWPF_HIT, emask=0x24, umask=0xc8
L2_RQSTS.SWPF_MISS, emask=0x24, umask=0x28
LOAD_HIT_PREFETCH.SWPF, emask=0x01, umask=0x4c (this very misleadingly is non-sw prefetch hits)
L1D.REPLACEMENT
and L1-DCACHE-LOAD-MISSES
kind of works, it works if I delay the rdpmc
but if they are one after another it seems unreliable at best. The other ones are complete busts.
Questions:
- Should any of these work for detecting if prefetches hit L1 dcache? (i.e my testing is bad)
- If not. Whats events could be used to detect if a prefetch hit L1 dcache?
Edit: MEM_LOAD_RETIRED.L1_HIT
does not appear to work for software prefetch.
Here is the code I am using to do test:
#include <asm/unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#define HIT 0
#define MISS 1
#define TODO MISS
#define PAGE_SIZE 4096
// to force hit make TSIZE low
#define TSIZE 10000
#define err_assert(cond) \
if (__builtin_expect(!(cond), 0)) { \
fprintf(stderr, "%d:%d: %s\n", __LINE__, errno, strerror(errno)); \
exit(-1); \
}
uint64_t
get_addr() {
uint8_t * addr =
(uint8_t *)mmap(NULL, TSIZE * PAGE_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
err_assert(addr != NULL);
for (uint32_t i = 0; i < TSIZE; ++i) {
addr[i * PAGE_SIZE + (PAGE_SIZE - 1)] = 0;
#if TODO == HIT
addr[i * PAGE_SIZE] = 0;
#endif
}
return uint64_t(addr);
}
int
perf_event_open(struct perf_event_attr * hw_event,
pid_t pid,
int cpu,
int group_fd,
unsigned long flags) {
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
return ret;
}
void
init_perf_event_struct(struct perf_event_attr * pe,
const uint32_t type,
const uint64_t ev_config,
int lead) {
__builtin_memset(pe, 0, sizeof(struct perf_event_attr));
pe->type = type;
pe->size = sizeof(struct perf_event_attr);
pe->config = ev_config;
pe->disabled = !!lead;
pe->exclude_kernel = 1;
pe->exclude_hv = 1;
}
/* Fixed Counters */
static constexpr uint32_t core_instruction_ev = 0x003c;
static constexpr uint32_t core_instruction_idx = (1 << 30) + 0;
static constexpr uint32_t core_cycles_ev = 0x00c0;
static constexpr uint32_t core_cycles_idx = (1 << 30) + 1;
static constexpr uint32_t ref_cycles_ev = 0x0300;
static constexpr uint32_t ref_cycles_idx = (1 << 30) + 2;
/* programmable counters */
static constexpr uint32_t mem_load_retired_l1_hit = 0x01d1;
static constexpr uint32_t mem_load_retired_l1_miss = 0x08d1;
int
init_perf_tracking() {
struct perf_event_attr pe;
init_perf_event_struct(&pe, PERF_TYPE_RAW, core_instruction_ev, 1);
int leadfd = perf_event_open(&pe, 0, -1, -1, 0);
err_assert(leadfd >= 0);
init_perf_event_struct(&pe, PERF_TYPE_RAW, core_cycles_ev, 0);
err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);
init_perf_event_struct(&pe, PERF_TYPE_RAW, ref_cycles_ev, 0);
err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);
init_perf_event_struct(&pe, PERF_TYPE_RAW, mem_load_retired_l1_hit, 0);
err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);
return leadfd;
}
void
start_perf_tracking(int leadfd) {
ioctl(leadfd, PERF_EVENT_IOC_RESET, 0);
ioctl(leadfd, PERF_EVENT_IOC_ENABLE, 0);
}
#define _V_TO_STR(X) #X
#define V_TO_STR(X) _V_TO_STR(X)
//#define DO_PREFETCH
#ifdef DO_PREFETCH
#define DO_MEMORY_OP(addr) "prefetcht0 (%[" V_TO_STR(addr) "])\n\t"
#else
#define DO_MEMORY_OP(addr) "movl (%[" V_TO_STR(addr) "]), %%eax\n\t"
#endif
int
main() {
int fd = init_perf_tracking();
start_perf_tracking(fd);
uint64_t addr = get_addr();
uint32_t prefetch_miss, cycles_to_detect;
asm volatile(
"lfence\n\t"
"movl %[core_cycles_idx], %%ecx\n\t"
"rdpmc\n\t"
"movl %%eax, %[cycles_to_detect]\n\t"
"xorl %%ecx, %%ecx\n\t"
"rdpmc\n\t"
"movl %%eax, %[prefetch_miss]\n\t"
"lfence\n\t"
DO_MEMORY_OP(prefetch_addr)
"lfence\n\t"
"xorl %%ecx, %%ecx\n\t"
"rdpmc\n\t"
"subl %[prefetch_miss], %%eax\n\t"
"movl %%eax, %[prefetch_miss]\n\t"
"movl %[core_cycles_idx], %%ecx\n\t"
"rdpmc\n\t"
"subl %[cycles_to_detect], %%eax\n\t"
"movl %%eax, %[cycles_to_detect]\n\t"
"lfence\n\t"
: [ prefetch_miss ] "=&r"(prefetch_miss),
[ cycles_to_detect ] "=&r"(cycles_to_detect)
: [ prefetch_addr ] "r"(addr), [ core_cycles_idx ] "i"(core_cycles_idx)
: "eax", "edx", "ecx");
fprintf(stderr, "Hit : %d\n", prefetch_miss);
fprintf(stderr, "Cycles : %d\n", cycles_to_detect);
}
if I define DO_PREFETCH
the results for MEM_LOAD_RETIRED.L1_HIT
are always 1 (always appears to get a hit). If I comment out DO_PREFETCH
the results correspond with what I would expect (when the address is clearly not in cache reports miss, when it clearly is reports hit).
With DO_PREFETCH
:
g++ -DDO_PREFETCH -O3 -march=native -mtune=native prefetch_hits.cc -o prefetch_hits
$> ./prefetch_hits
Hit : 1
Cycles : 554
and without DO_PREFETCH
g++ -DDO_PREFETCH -O3 -march=native -mtune=native prefetch_hits.cc -o prefetch_hits
$> ./prefetch_hits
Hit : 0
Cycles : 888
With L2_RQSTS.SWPF_HIT
and L2_RQSTS.SWPF_MISS
was able to get it to work. Big thanks to Hadi Brais. Worth noting that the reason L1D_PEND_MISS.PENDING
didn't work might be related to Icelake. Hadi Brais reported getting it to work for predicting L1D cached misses on Haswell.
In the interest of trying to determine why L1_PEND_MISS.PENDING
and MEM_LOAD_RETIRED.L1_HIT
do not work posted the exact code I'm using for testing them:
#include <asm/unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#define HIT 0
#define MISS 1
#define TODO MISS
#define PAGE_SIZE 4096
#define TSIZE 1000
#define err_assert(cond) \
if (__builtin_expect(!(cond), 0)) { \
fprintf(stderr, "%d:%d: %s\n", __LINE__, errno, strerror(errno)); \
exit(-1); \
}
uint64_t
get_addr() {
uint8_t * addr =
(uint8_t *)mmap(NULL, TSIZE * PAGE_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
err_assert(addr != NULL);
__builtin_memset(addr, -1, TSIZE * PAGE_SIZE);
return uint64_t(addr);
}
int
perf_event_open(struct perf_event_attr * hw_event,
pid_t pid,
int cpu,
int group_fd,
unsigned long flags) {
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
return ret;
}
void
init_perf_event_struct(struct perf_event_attr * pe,
const uint32_t type,
const uint64_t ev_config,
int lead) {
__builtin_memset(pe, 0, sizeof(struct perf_event_attr));
pe->type = type;
pe->size = sizeof(struct perf_event_attr);
pe->config = ev_config;
pe->disabled = !!lead;
pe->exclude_kernel = 1;
pe->exclude_hv = 1;
}
/* Fixed Counters */
static constexpr uint32_t core_instruction_ev = 0x003c;
static constexpr uint32_t core_instruction_idx = (1 << 30) + 0;
static constexpr uint32_t core_cycles_ev = 0x00c0;
static constexpr uint32_t core_cycles_idx = (1 << 30) + 1;
static constexpr uint32_t ref_cycles_ev = 0x0300;
static constexpr uint32_t ref_cycles_idx = (1 << 30) + 2;
/* programmable counters */
static constexpr uint32_t mem_load_retired_l1_hit = 0x01d1;
static constexpr uint32_t mem_load_retired_l1_miss = 0x08d1;
static constexpr uint32_t l1d_pending = 0x0148;
static constexpr uint32_t swpf_hit = 0xc824;
static constexpr uint32_t swpf_miss = 0x2824;
static constexpr uint32_t ev0 = l1d_pending;
#define NEVENTS 1
#if NEVENTS > 1
static constexpr uint32_t ev1 = swpf_miss;
#endif
int
init_perf_tracking() {
struct perf_event_attr pe;
init_perf_event_struct(&pe, PERF_TYPE_RAW, core_instruction_ev, 1);
int leadfd = perf_event_open(&pe, 0, -1, -1, 0);
err_assert(leadfd >= 0);
init_perf_event_struct(&pe, PERF_TYPE_RAW, core_cycles_ev, 0);
err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);
init_perf_event_struct(&pe, PERF_TYPE_RAW, ref_cycles_ev, 0);
err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);
init_perf_event_struct(&pe, PERF_TYPE_RAW, ev0, 0);
err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);
#if NEVENTS > 1
init_perf_event_struct(&pe, PERF_TYPE_RAW, ev1, 0);
err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);
#endif
return leadfd;
}
void
start_perf_tracking(int leadfd) {
ioctl(leadfd, PERF_EVENT_IOC_RESET, 0);
ioctl(leadfd, PERF_EVENT_IOC_ENABLE, 0);
}
#define _V_TO_STR(X) #X
#define V_TO_STR(X) _V_TO_STR(X)
//#define LFENCE
#ifdef LFENCE
#define SERIALIZER() "lfence\n\t"
#else
#define SERIALIZER() \
"xorl %%ecx, %%ecx\n\t" \
"xorl %%eax, %%eax\n\t" \
"cpuid\n\t"
#endif
#define DO_PREFETCH
#ifdef DO_PREFETCH
#define DO_MEMORY_OP(addr) "prefetcht0 (%[" V_TO_STR(addr) "])\n\t"
#else
#define DO_MEMORY_OP(addr) "movl (%[" V_TO_STR(addr) "]), %%eax\n\t"
#endif
int
main() {
int fd = init_perf_tracking();
start_perf_tracking(fd);
uint64_t addr = get_addr();
// to ensure page in TLB
*((volatile uint64_t *)(addr + (PAGE_SIZE - 8))) = 0;
#if TODO == HIT
// loading from 0 offset to check cache miss / hit
*((volatile uint64_t *)addr) = 0;
#endif
uint32_t ecount0 = 0, ecount1 = 0, cycles_to_detect = 0;
asm volatile(
SERIALIZER()
"movl %[core_cycles_idx], %%ecx\n\t"
"rdpmc\n\t"
"movl %%eax, %[cycles_to_detect]\n\t"
"xorl %%ecx, %%ecx\n\t"
"rdpmc\n\t"
"movl %%eax, %[ecount0]\n\t"
#if NEVENTS > 1
"movl $1, %%ecx\n\t"
"rdpmc\n\t"
"movl %%eax, %[ecount1]\n\t"
#endif
SERIALIZER()
DO_MEMORY_OP(prefetch_addr)
SERIALIZER()
"xorl %%ecx, %%ecx\n\t"
"rdpmc\n\t"
"subl %[ecount0], %%eax\n\t"
"movl %%eax, %[ecount0]\n\t"
#if NEVENTS > 1
"movl $1, %%ecx\n\t"
"rdpmc\n\t"
"subl %[ecount1], %%eax\n\t"
"movl %%eax, %[ecount1]\n\t"
#endif
"movl %[core_cycles_idx], %%ecx\n\t"
"rdpmc\n\t"
"subl %[cycles_to_detect], %%eax\n\t"
"movl %%eax, %[cycles_to_detect]\n\t"
SERIALIZER()
: [ ecount0 ] "=&r"(ecount0),
#if NEVENTS > 1
[ ecount1 ] "=&r"(ecount1),
#endif
[ cycles_to_detect ] "=&r"(cycles_to_detect)
: [ prefetch_addr ] "r"(addr), [ core_cycles_idx ] "i"(core_cycles_idx)
: "eax", "edx", "ecx");
fprintf(stderr, "E0 : %d\n", ecount0);
fprintf(stderr, "E1 : %d\n", ecount1);
fprintf(stderr, "Cycles : %d\n", cycles_to_detect);
}