I directly access host memory in a CUDA kernel and get no error. Why is this?
I tried to learn more from the cudaMallocHost documentation, which says:
Allocates size bytes of host memory that is page-locked and accessible to the device. The driver tracks the virtual memory ranges allocated with this function and automatically accelerates calls to functions such as cudaMemcpy*(). Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory obtained with functions such as malloc().
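
If I read that correctly, it is a claim about copy bandwidth, not about kernels dereferencing the pointer. Here is a minimal sketch of the comparison I think it describes (the buffer size and variable names are my own): time the same cudaMemcpy once from a pageable malloc buffer and once from a pinned cudaMallocHost buffer.

#include <stdio.h>
#include <stdlib.h>

int main() {
    const size_t bytes = 64 << 20;          // 64 MiB test buffer (arbitrary)
    char *pageable = (char *)malloc(bytes); // ordinary pageable host memory
    char *pinned, *dev;
    cudaMallocHost(&pinned, bytes);         // page-locked host memory
    cudaMalloc(&dev, bytes);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float ms;

    // Warm-up copy so context creation is not included in the timing.
    cudaMemcpy(dev, pageable, bytes, cudaMemcpyHostToDevice);

    // Time a host-to-device copy from the pageable buffer.
    cudaEventRecord(start);
    cudaMemcpy(dev, pageable, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("pageable H2D: %.2f ms\n", ms);

    // Time the same copy from the pinned buffer; this is the
    // "automatically accelerated" path the documentation mentions.
    cudaEventRecord(start);
    cudaMemcpy(dev, pinned, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("pinned   H2D: %.2f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(pageable);
    cudaFreeHost(pinned);
    cudaFree(dev);
    return 0;
}
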
Why do many CUDA programs add a cudaMemcpy after cudaMallocHost?
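
For reference, this is the pattern I mean, sketched with my own hypothetical names (vec_add, h_*/d_*) and sizes: the pinned host buffers are only staging areas, explicit cudaMemcpy calls move the data into separate cudaMalloc buffers, and the kernel touches device memory only.

#include <stdio.h>

#define N 64

__global__ void vec_add(int *a, int *b, int *c) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N * N) c[i] = a[i] + b[i];
}

int main() {
    size_t size = N * N * sizeof(int);
    int *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
    cudaMallocHost(&h_a, size);     // pinned host staging buffers
    cudaMallocHost(&h_b, size);
    cudaMallocHost(&h_c, size);
    cudaMalloc(&d_a, size);         // device buffers the kernel actually uses
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);
    for (int i = 0; i < N * N; i++) { h_a[i] = i; h_b[i] = 2 * i; }

    // The cudaMemcpy calls my question is about: host -> device before
    // the kernel, device -> host after it.
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    vec_add<<<(N * N + 255) / 256, 256>>>(d_a, d_b, d_c);
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    printf("h_c[1] = %d\n", h_c[1]); // expect 3
    cudaFreeHost(h_a); cudaFreeHost(h_b); cudaFreeHost(h_c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}

My test program below skips those copies and passes the pinned host pointers a and b straight to the kernel, yet it still runs without error:
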
#include <stdio.h>
#include <assert.h>

#define N 64

// CUDA kernel: reads the pinned host buffers a and b directly
// (no prior cudaMemcpy) and writes the sums into device memory c_gpu.
__global__ void gpu(int *a, int *b, int *c_gpu) {
    int r = blockDim.x * blockIdx.x + threadIdx.x;
    int c = blockDim.y * blockIdx.y + threadIdx.y;
    if (r < N && c < N) {
        c_gpu[r * N + c] = a[r * N + c] + b[r * N + c];
    }
}

// CPU reference implementation of the same element-wise addition.
void cpu(int *a, int *b, int *c_cpu) {
    for (int r = 0; r < N; r++) {
        for (int c = 0; c < N; c++) {
            c_cpu[r * N + c] = a[r * N + c] + b[r * N + c];
        }
    }
}

int main() {
    int *a, *b, *c_cpu, *c_gpu, *c_gpu_cpu;
    size_t size = N * N * sizeof(int);

    cudaMallocHost(&a, size);         // page-locked (pinned) host memory
    cudaMallocHost(&b, size);
    cudaMallocHost(&c_cpu, size);
    cudaMallocHost(&c_gpu_cpu, size);
    cudaMalloc(&c_gpu, size);         // device memory for the GPU result

    for (int r = 0; r < N; r++) {
        for (int c = 0; c < N; c++) {
            a[r * N + c] = r;
            b[r * N + c] = c;
            c_gpu_cpu[r * N + c] = 0;
            c_cpu[r * N + c] = 0;
        }
    }

    cpu(a, b, c_cpu);

    dim3 threads(16, 16, 1);
    dim3 blocks((N + threads.x - 1) / threads.x, (N + threads.y - 1) / threads.y, 1);
    gpu<<<blocks, threads>>>(a, b, c_gpu); // pinned host pointers passed straight to the kernel
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Error: %s\n", cudaGetErrorString(err));
    }
    cudaDeviceSynchronize();

    // Copy the device result back and check it against the CPU reference.
    cudaMemcpy(c_gpu_cpu, c_gpu, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < N * N; i++) {
        assert(c_gpu_cpu[i] == c_cpu[i]);
    }

    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c_cpu);
    cudaFreeHost(c_gpu_cpu);
    cudaFree(c_gpu);
    return 0;
}