
I am trying to find the physical PCIe bus addresses of GPU memory so that an external PCIe device such as an FPGA can initiate inbound DMA to the GPU (similar to How to get physical address of GPU memory for DMA? (OpenCL)). We are specifically trying to avoid GPU-mastered transfers or staging the data indirectly through host memory.

I have a Linux virtual-to-physical address translation service (see code below) that works well for decoding mmap'd regions, including mappings of other PCIe devices, and of course I know the physical BAR addresses of the GPUs as reported by the "Region" lines in lspci -vvv.
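
For reference, the same BAR base addresses that lspci reports can also be read from sysfs; a minimal sketch (the 0000:01:00.0 device address below is a placeholder for the GPU's bus address):

// Print the BAR base addresses for a PCIe device, matching the "Region"
// lines from lspci -vvv. The device address below is a placeholder.
#include <stdio.h>

int main(void)
{
  FILE *f = fopen("/sys/bus/pci/devices/0000:01:00.0/resource", "r");
  if (!f)
    return 1;
  unsigned long long start, end, flags;
  // The first six lines of the resource file are BARs 0-5
  for (int bar = 0; bar < 6 &&
       fscanf(f, "%llx %llx %llx", &start, &end, &flags) == 3; bar++)
    if (start)
      printf("BAR%d: 0x%llx-0x%llx (flags 0x%llx)\n", bar, start, end, flags);
  fclose(f);
  return 0;
}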

I have tried instrumenting the CUDA sample memMapIPCDrv to examine the virtual and physical addresses returned by cuMemCreate and cuMemMap, and I have tried looking at the id(obj) and obj.gpu_data.device_ctypes_pointer.value virtual and physical addresses from Python Numba. In all cases, the resulting physical addresses are in host memory, not in the GPU's address space. I'll include the Python test case below.
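
For reference, the sequence I instrumented in that sample boils down to the following (condensed sketch, error checking omitted; dptr is the virtual address whose translation I then examined):

// Condensed sketch of the cuMemCreate/cuMemMap sequence used by the
// memMapIPCDrv sample (error checking omitted).
#include <cuda.h>

CUdeviceptr map_device_memory(int dev, size_t size)
{
  CUmemAllocationProp prop = {0};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = dev;

  size_t gran;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size = (size + gran - 1) / gran * gran;

  CUmemGenericAllocationHandle handle;
  cuMemCreate(&handle, size, &prop, 0);       // physical allocation on the GPU

  CUdeviceptr dptr;
  cuMemAddressReserve(&dptr, size, 0, 0, 0);  // reserve a VA range
  cuMemMap(dptr, size, 0, handle, 0);         // map the allocation into it

  CUmemAccessDesc access = {0};
  access.location = prop.location;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(dptr, size, &access, 1);     // enable device access
  return dptr;
}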

Any ideas on how to find the actual on-GPU physical PCIe address?

The example code below allocates GPU-accessible memory and then checks where the associated physical memory is mapped:

import numpy as np
import sys
import numba
from numba import cuda
import socket
import struct
from numba.cuda.cudadrv import driver as _driver
import re
import os

# Prepare for virtual to physical address translation
mySocket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
mySocket.connect("/run/pagemapd")

def do_translate(a):
    """Virtual to Physical Address translation"""
    b = struct.pack('Q',a)
    mySocket.sendall(b)
    data = mySocket.recv(8)
    return struct.unpack("Q", data)[0]

def hexsep(n):
    """Convert int to hex, then separate the digits into groups"""
    return _hexsep(hex(n))

def _hexsep(H):
    """Separate hex digits into groups (of 4, _ separated)"""
    h = ''
    while h != H:
        h = H
        H = re.sub(r'([0-9a-f]+)([0-9a-f]{4})(?:\b|_)', r'\1_\2', h)
    return h

def p_translate(a):
    """Translate virtual address to physical and return both as string"""
    pa = do_translate(a)
    return f"v{hexsep(a)}->p{hexsep(pa)}"

if not numba.cuda.is_available():
    print("no CUDA GPU found")
    sys.exit()

# Create a host array and a mapped (pinned-host, GPU-accessible) copy of it
n = np.ones(8192,dtype=np.uint8)*7
ngpu = cuda.mapped_array_like(n)
narr = numba.cuda.as_cuda_array(ngpu)
narr.copy_to_device(n)

# Print relevant physical and virtual addresses, and where memory lives
print(f"n    addresses are: id(n)={p_translate(id(n))},  id(n.data)={p_translate(id(n.data))}, Memory is {'on' if _driver.is_device_memory(n) else 'off'} GPU")
print(f"ngpu addresses are: id(ngpu)={p_translate(id(ngpu))},  id(ngpu.data)={p_translate(id(ngpu.data))}, ngpu.gpu_data={p_translate(ngpu.gpu_data.device_ctypes_pointer.value)}, Memory is {'on' if _driver.is_device_memory(ngpu) else 'off'} GPU")
print(f"narr addresses are: id(narr)={p_translate(id(narr))},  narr.gpu_data={p_translate(narr.gpu_data.device_ctypes_pointer.value)}, Memory is {'on' if _driver.is_device_memory(ngpu) else 'off'} GPU")

# Print contents of array
print("ngpu",ngpu)
nn = narr.copy_to_host()
print("copy",nn)

# Set the environment variable DEBUG to anything to print the process's
# mapping table, to see where the addresses came from
if "DEBUG" in os.environ:
    with open("/proc/self/maps") as R:
        for l in R:
            m = re.search(r'^([0-9a-f]{4,})-([0-9a-f]{4,})(.*)', l)
            if m:
                print(_hexsep(m.group(1)) + "-" + _hexsep(m.group(2)) + m.group(3))
            else:
                print(l, end='')

// Service (run as root) translating virtual address to physical

#define _GNU_SOURCE
#include <sys/socket.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <arpa/inet.h>
#include <ctype.h>
#include <err.h>
#include <linux/un.h>
#include <pthread.h>
#include <sys/stat.h>

#define SPATH "/run/pagemapd"
long pagesize;

// Handle a particular client connection, performing translation
static void *thread_start(void *arg)
{
  char buf[64];                  // request buffer, also reused for the pagemap path
  struct ucred ucred;
  socklen_t ucredlen = sizeof(ucred);
  int cfd = (int)(intptr_t)arg;  // client fd is passed by value to avoid a race
  int pfd = -1;

  if (getsockopt(cfd, SOL_SOCKET, SO_PEERCRED, &ucred, &ucredlen) < 0)
  {
    warn("Cannot getsockopt");
    goto endf;
  }

  snprintf(buf, sizeof(buf), "/proc/%d/pagemap", ucred.pid);
  if ((pfd = open(buf, O_RDONLY)) < 0)
  {
    warn("Cannot open pagemap for %d from %s", ucred.pid, buf);
    goto endf;
  }

  while (read(cfd, buf, sizeof(uintptr_t)) == sizeof(uintptr_t))
  {
    uintptr_t targetaddr = *(uintptr_t *)buf;
    uintptr_t entry = 0;

    // pagemap holds one 8-byte entry per virtual page of the client process
    if (lseek(pfd, targetaddr / pagesize * sizeof(uintptr_t), SEEK_SET) < 0)
    {
      warn("Cannot seek for %d", ucred.pid);
      goto endf;
    }

    if (read(pfd, &entry, sizeof(entry)) != sizeof(entry))
    {
      warn("Cannot read for %d", ucred.pid);
      goto endf;
    }

    // Bits 0-54 of the entry hold the PFN; scale by the page size and
    // add back the offset within the page
    targetaddr = ((entry & 0x7fffffffffffffULL) * pagesize) + (targetaddr % pagesize);

    if (write(cfd, &targetaddr, sizeof(targetaddr)) != sizeof(targetaddr))
    {
      warn("Cannot write for %d", ucred.pid);
      goto endf;
    }
  }
 endf:
  close(cfd);
  if (pfd >= 0)
    close(pfd);
  return NULL;
}

// Set up socket and create thread per connection
int main()
{
  int fd = socket(PF_UNIX, SOCK_STREAM, 0);
  if (fd < 0)
    err(1, "socket");

  pagesize = sysconf(_SC_PAGESIZE);

  struct sockaddr_un address, client;
  socklen_t clientlen = sizeof(client);
  memset(&address, 0, sizeof(struct sockaddr_un));

  unlink(SPATH);
  address.sun_family = AF_UNIX;
  snprintf(address.sun_path, UNIX_PATH_MAX, SPATH);
  if (bind(fd, (struct sockaddr *)&address, sizeof(address)) < 0)
    err(1, "bind");

  chmod(SPATH, 0666);
  if (listen(fd, 5) < 0)
    err(1, "listen");

  int clientfd;
  while ((clientfd = accept(fd, (struct sockaddr *)&client, &clientlen)) >= 0)
  {
    pthread_t child;
    // Pass the fd by value; pthread_create returns an error number, not -1
    if (pthread_create(&child, NULL, thread_start, (void *)(intptr_t)clientfd) != 0)
      err(2, "pthread_create");
    pthread_detach(child);
  }
}

1 Answer


Exposing CUDA device memory for access by another PCIe device requires kernel-driver calls: a kernel module pins the allocation with nvidia_p2p_get_pages() and reads the resulting bus addresses from the returned page table. You can find a detailed explanation of how to do that in the GPUDirect RDMA documentation.
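
A minimal sketch of that kernel-side flow, based on the nv-p2p.h interface described in the GPUDirect RDMA documentation (module plumbing omitted; vaddr is assumed to be a 64KB-aligned device pointer, e.g. from cudaMalloc, passed in from user space):

// Kernel-module sketch (not a complete module): pin a CUDA allocation
// and read out the bus addresses an external PCIe master can target.
#include <linux/kernel.h>
#include "nv-p2p.h"   // ships with the NVIDIA driver

#define GPU_PAGE_SIZE 0x10000  // GPUDirect RDMA exposes memory in 64KB pages

static void free_callback(void *data)
{
  // Invoked by the NVIDIA driver if the allocation is torn down underneath
  // us; a real module must stop DMA and free the page table here.
}

static int pin_gpu_memory(u64 vaddr, u64 len)
{
  struct nvidia_p2p_page_table *page_table = NULL;
  u32 i;
  int ret;

  ret = nvidia_p2p_get_pages(0, 0, vaddr, len, &page_table,
                             free_callback, NULL);
  if (ret)
    return ret;

  // These are the physical/bus addresses to program into the external
  // device's DMA engine (subject to IOMMU configuration)
  for (i = 0; i < page_table->entries; i++)
    pr_info("GPU page %u at 0x%llx\n", i,
            page_table->pages[i]->physical_address);

  nvidia_p2p_put_pages(0, 0, vaddr, page_table);
  return 0;
}

The FPGA then DMAs directly to those addresses, with no GPU-mastered operation and no staging through host memory.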