0
votes

I have written a sample program to understand the effects of GPU/CPU pinned memory and Heap memory. The following code illustrates this. I have allocated three buffers of dimensions say 1280x720. I have filled buffers 1 & 2 with some data and in turn used these buffers to fill buffer 3. The mathematical operation involved in filling buffer 3 is insignificant. In case 1, the memory allocated for these buffers is from the heap (malloc call). In case 2, the memory for these buffers is allocated through OpenCL API calls (clCreateBuffer()). There is a performance difference between these 2 cases. I tested it on Intel integrated GPUs. I am unable to explain this difference in performance. Does it have something to do with the cacheability properties of CPU/GPU pinned memory vs heap memory?

Have you encountered such behavior before, or am I doing something wrong?

#include <stdio.h>
#include <malloc.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>

#define OPENCL

#if defined(_WIN32)
/*
 * Win32 specific includes
 */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#else
#include <sys/time.h>

/* timersub is not provided by msys at this time. */
#ifndef timersub
#define timersub(a, b, result) \
    do { \
      (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
      (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
      if ((result)->tv_usec < 0) { \
        --(result)->tv_sec; \
        (result)->tv_usec += 1000000; \
      } \
    } while (0)
#endif
#endif


/* Cross-platform microsecond stopwatch: holds one start sample and one
 * end sample.  On Win32 the samples are QueryPerformanceCounter tick
 * counts; on other platforms they are gettimeofday() timevals. */
struct usec_timer {
#if defined(_WIN32)
  LARGE_INTEGER  begin, end;
#else
  struct timeval begin, end;
#endif
};


/* Captures the current time into t->begin.  Call immediately before the
 * code section being measured. */
static void usec_timer_start(struct usec_timer *t) {
#if defined(_WIN32)
  QueryPerformanceCounter(&t->begin);
#else
  gettimeofday(&t->begin, NULL);
#endif
}


/* Captures the current time into t->end.  Call immediately after the
 * code section being measured; pair with usec_timer_start(). */
static void usec_timer_mark(struct usec_timer *t) {
#if defined(_WIN32)
  QueryPerformanceCounter(&t->end);
#else
  gettimeofday(&t->end, NULL);
#endif
}


/* Returns the elapsed time between usec_timer_start() and
 * usec_timer_mark(), in microseconds. */
static int64_t usec_timer_elapsed(struct usec_timer *t) {
#if defined(_WIN32)
  LARGE_INTEGER freq, diff;

  diff.QuadPart = t->end.QuadPart - t->begin.QuadPart;

  QueryPerformanceFrequency(&freq);
  return diff.QuadPart * 1000000 / freq.QuadPart;
#else
  struct timeval diff;

  timersub(&t->end, &t->begin, &diff);
  /* Widen tv_sec before multiplying: tv_sec is a plain long (32 bits on
   * many ABIs), so evaluating tv_sec * 1000000 in long arithmetic would
   * overflow for intervals longer than ~35 minutes. */
  return (int64_t)diff.tv_sec * 1000000 + diff.tv_usec;
#endif
}


#ifdef OPENCL
#include ".\CL\cl.h"

/* Initializes OpenCL: picks the first platform and the first GPU device
 * on it, then creates a context and an in-order command queue for it.
 *
 * On success returns 0 and stores the created objects through *context
 * and *cmd_queue (ownership passes to the caller, who must release
 * them).  On failure returns 1; no objects are leaked. */
int opencl_init(cl_context *context, cl_command_queue *cmd_queue) {
  cl_int status;
  cl_uint num_platforms = 0;
  cl_platform_id platform;
  cl_uint num_devices = 0;
  cl_device_id device;
  cl_command_queue_properties command_queue_properties = 0;

  // Get the number of platforms in the system.
  status = clGetPlatformIDs(0, NULL, &num_platforms);
  if (status != CL_SUCCESS || num_platforms == 0)
    goto fail;

  // Get the platform ID for one platform
  status = clGetPlatformIDs(1, &platform, NULL);
  if (status != CL_SUCCESS)
    goto fail;

  // Get the number of devices available on the platform
  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
  if (status != CL_SUCCESS || num_devices == 0)
    goto fail;

  // Get the device ID for one device
  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
  if (status != CL_SUCCESS)
    goto fail;

  // Create OpenCL context for one device
  *context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
  if (status != CL_SUCCESS || *context == NULL)
    goto fail;

  // Create command queues for the device
  *cmd_queue = clCreateCommandQueue(*context, device, command_queue_properties, &status);
  if (status != CL_SUCCESS || *cmd_queue == NULL) {
    // Was leaked on this path before: release the context we created
    // above so a queue-creation failure does not leak it.
    clReleaseContext(*context);
    *context = NULL;
    goto fail;
  }
  return 0;

fail:
  return 1;
}
#endif

/* Benchmark driver.  argv[1] selects the allocation strategy:
 * non-zero = OpenCL pinned (CL_MEM_ALLOC_HOST_PTR, mapped to host),
 * zero = plain heap (malloc).  Three width*height byte buffers are
 * allocated, filled, and buffer 2 is repeatedly accumulated from
 * buffers 0 and 1; the elapsed time is printed in microseconds. */
int main(int argc, char **argv) {
  int x, y, z;
  int width = 1280, height = 720;
  unsigned char *buffer[3];
  int use_gpu;
  cl_mem opencl_mem[3];
  cl_context context;
  cl_command_queue cmd_queue;
  cl_int status;

  if (argc != 2)
    return 0;

  use_gpu = atoi(argv[1]);

  if (use_gpu) {
    if (opencl_init(&context, &cmd_queue)) {
      printf("OpenCL init failure");
      // Was only a warning before: continuing here would use an
      // uninitialized context/cmd_queue below.
      return 1;
    }
  }

  if (use_gpu) {
    for (x = 0; x < 3; x++) {
      opencl_mem[x] = clCreateBuffer(context,
                                     CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                     width * height * sizeof(*buffer[x]), NULL,
                                     &status);
      if (status != CL_SUCCESS)
        return 0;
      // Map the device buffer into host address space so the CPU loop
      // below can touch it directly (pinned host memory).
      buffer[x] = clEnqueueMapBuffer(cmd_queue, opencl_mem[x], CL_TRUE,
                                     CL_MAP_READ | CL_MAP_WRITE, 0,
                                     width * height * sizeof(*buffer[x]), 0,
                                     NULL, NULL, &status);
      if (status != CL_SUCCESS) {
        clReleaseMemObject(opencl_mem[x]);
        opencl_mem[x] = NULL;
        return 0;
      }
    }
  } else {
    for (x = 0; x < 3; x++) {
      buffer[x] = malloc(width * height * sizeof(*buffer[x]));
      if (buffer[x] == NULL) {
        printf("Unable to alloc memory");
        // Was falling through before: the memset below would have
        // dereferenced a NULL pointer.
        return 1;
      }
    }
  }

  memset(buffer[0], 1, width * height * sizeof(*buffer[0]));
  memset(buffer[1], 2, width * height * sizeof(*buffer[1]));
  memset(buffer[2], 0, width * height * sizeof(*buffer[2]));

  {
    struct usec_timer emr_timer;
    usec_timer_start(&emr_timer);
    for (z = 0; z < 600; z++) {
      for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
          // don't worry about overflows
          buffer[2][y * width + x] += buffer[0][y * width + x]
                                     + buffer[1][y * width + x];
        }
      }
    }
    usec_timer_mark(&emr_timer);
    // PRId64: usec_timer_elapsed() returns a signed int64_t; printing
    // it with PRIu64 was a mismatched format specifier (UB).
    printf("Elapsed time %"PRId64"\n", usec_timer_elapsed(&emr_timer));
  }

  if (use_gpu) {
    // Use index x throughout: the old code unmapped/released element
    // [0] three times and leaked elements [1] and [2].
    for (x = 0; x < 3; x++) {
      if (buffer[x] != NULL) {
        status = clEnqueueUnmapMemObject(cmd_queue, opencl_mem[x], buffer[x], 0,
                                         NULL, NULL);
        status |= clFinish(cmd_queue);
        if (status != CL_SUCCESS)
          return 0;
        buffer[x] = NULL;
      }

      if (opencl_mem[x] != NULL) {
        status = clReleaseMemObject(opencl_mem[x]);
        if (status != CL_SUCCESS)
          return 0;
        opencl_mem[x] = NULL;
      }
    }

    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
  } else {
    for (x = 0; x < 3; x++) {
      free(buffer[x]);
      buffer[x] = NULL;
    }
  }
  return 0;
}
1

1 Answers

2
votes

If you use malloc + operation + free you are using only CPU resources.

If you use OpenCL you are using CPU + GPU, and you incur synchronization and data-copy penalties.

  • Alloc in GPU
  • Map to CPU space (allocs another buffer in CPU)
  • Operate CPU buffer
  • Unmap (pinned copy to the GPU buffer + deallocate the CPU one).
  • Destroy GPU buffer

What makes you think it should have the same speed? Of course it is more costly, and it always will be. You are doing the same CPU operation + some extra OpenCL operations.

Pinned memory is faster than non-pinned memory in transfers, but it is never faster than non copy, because you simply are not copying anything!

Also, for a memory benchmark, doing an operation on just 3*1280*720 = 2.6 MB is far too small. It would take only microseconds on common systems. And anyway, that part should be the same for both cases.

The overhead will dominate your results, rather than the throughput.