I'm attempting a basic matrix multiplication program in OpenCL. I believe my issues are in my enqueue and/or buffer reading, as I am getting completely incorrect output for the result matrix, as well as incorrect first rows for matrices A and B. I'm new to OpenCL and I've been banging my head against this for quite a while now, maybe someone here can give me a hint as to where I'm going wrong?
Host Code:
#define __NO_STD_VECTOR // Uses cl::vector instead of standard version
#include <CL/cl.hpp>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <math.h>
#include <string>
/* Defined matrix width/height constants */
#define numRowsA 3
#define numColsA 3
#define numRowsB 3
#define numColsB 3
#define numRowsC numRowsA
#define numColsC numColsB
using namespace std;
/* Function declarations */
inline void checkErr(cl_int err, string name);
void initMatrix (float* matrix, int numIndices);
void printMatrix (string displayName, float* matrix, int numIndices,
int rowSize);
//*************
// Main Program
//*************
int main(int argc, char* argv[]) {
/* Check for valid matrix sizes */
if (numColsA != numRowsB) {
cout << "ERROR: Invalid matrix dimensions." << endl;
} else {
srand(2013); // Set random seed
/* Allocate memory for matrices A, B, and C */
unsigned int sizeA = numRowsA * numColsA;
unsigned int sizeB = numRowsB * numColsB;
unsigned int sizeC = numRowsC * numColsC;
unsigned int memoryA = sizeof(float) * sizeA;
unsigned int memoryB = sizeof(float) * sizeB;
unsigned int memoryC = sizeof(float) * sizeC;
/*
Allocate memoryA/memoryB/memoryC size blocks of bytes
(cast from void*)
*/
float* blockA = (float*) malloc(memoryA);
float* blockB = (float*) malloc(memoryB);
float* blockC = (float*) malloc(memoryC);
/* Initialize matrices A and B */
initMatrix(blockA, sizeA);
initMatrix(blockB, sizeB);
/* Display matrices A and B */
printMatrix("Matrix A", blockA, sizeA, numColsA);
printMatrix("Matrix B", blockB, sizeB, numColsB);
cl_int err; // Error code
string platformVendor; // Platform vendor
/* Create list of platforms */
cl::vector < cl::Platform > platformList;
cl::Platform::get(&platformList);
/*
Display potential Platform list generation error. If the
platform list size does not equal 0, CL_SUCCESS (0) is
sent to the function. If the platform list size does
equal 0, -1 is sent to the function.
*/
checkErr(platformList.size()!=0 ? CL_SUCCESS : -1,
"Platform");
/*
Replace empty value of platformVendor with device vendor
name
*/
platformList[0].getInfo((cl_platform_info) CL_PLATFORM_VENDOR,
&platformVendor);
/* Properties for Context constructor (Use unknown) */
cl_context_properties cprops[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties) (platformList[0]) (),
0
};
/* Create context */
cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL,
&err);
/* Display potential Context constructor error */
checkErr(err, "Context");
/* Create buffer for matrix A */
cl::Buffer deviceMemA(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeA, blockA, &err);
/* Create buffer for matrix B */
cl::Buffer deviceMemB(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeB, blockB, &err);
/* Create buffer for matrix C */
cl::Buffer deviceMemC(context,
CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeC, blockC, &err);
/* Create buffer for row (A) and col (C) */
cl::Buffer rowA(context, CL_MEM_READ_ONLY, sizeof(int),
(void *) numRowsA, &err);
cl::Buffer colC(context, CL_MEM_READ_ONLY, sizeof(int),
(void *) numColsC, &err);
/* Display potential Buffer constructor error */
checkErr(err, "Buffers");
/* Get list of devices */
cl::vector<cl::Device> devices =
context.getInfo<CL_CONTEXT_DEVICES>();
/* Check for at least one device, if not throw error */
checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "No Devices");
/* Read input from .cl file */
ifstream file("matrixMult1_kernels.cl");
/* Check for potential problem opening .cl input file */
checkErr(file.is_open() ? CL_SUCCESS:-1, "File Not Open");
/* Store file contents in a string */
string prog(istreambuf_iterator<char>(file),
(istreambuf_iterator<char>()));
/* Create source object */
cl::Program::Sources source(1, make_pair(prog.c_str(),
prog.length()+1));
/* Create program for given context and source */
cl::Program program(context, source);
err = program.build(devices, ""); // Check for build error
/* Display potential program build error */
checkErr(err, "Program Build");
/* Create kernel */
cl::Kernel kernel(program, "matrixMul", &err);
/* Display potential Kernel constructor error */
checkErr(err, "Kernel");
/*
Set matrixMul arguments, error checking after each
argument
*/
err = kernel.setArg(0, deviceMemA);
checkErr(err, "Arg0");
err = kernel.setArg(1, deviceMemB);
checkErr(err, "Arg1");
err = kernel.setArg(2, deviceMemC);
checkErr(err, "Arg2");
err = kernel.setArg(3, rowA);
checkErr(err, "Arg3");
err = kernel.setArg(4, colC);
checkErr(err, "Arg4");
/* Create command queue */
cl::CommandQueue queue(context, devices[0], 0, &err);
/* Display potential CommandQueue constructor error */
checkErr(err, "Command Queue");
/* Create event object */
cl::Event event;
cl::NDRange global(3, 3);
cl::NDRange local(1, 1);
/* Enqueue the kernel */
err = queue.enqueueNDRangeKernel(kernel, 2, global, local,
NULL, &event);
/* Display potential enqueueing error */
checkErr(err, "Enqueue");
/* Wait until kernel has completed execution before continuing */
event.wait();
/* Read kernel result back into host memory */
err = queue.enqueueReadBuffer(deviceMemC, CL_TRUE, 0, memoryC,
blockC, NULL, &event);
checkErr(err, "C");
err = queue.enqueueReadBuffer(deviceMemA, CL_TRUE, 0, sizeA,
blockA, NULL, &event);
err = queue.enqueueReadBuffer(deviceMemB, CL_TRUE, 0, sizeB,
blockB, NULL, &event);
/* Display potential kernel read error */
checkErr(err, "Read Buffer");
/* Display matrices */
cout << endl;
cout << "After:" << endl;
printMatrix("Matrix A", blockA, sizeA, numColsA);
printMatrix("Matrix B", blockB, sizeB, numColsB);
printMatrix("Matrix C", blockC, sizeC, numColsC);
/* Free up memory */
free(blockA);
free(blockB);
free(blockC);
}
}
//--------------------------------------------------------------------
// checkErr - Inline error checking function for OpenCL portion of
// host program.
//
// PRE: err is of type int in OpenCL; name is a string.
// POST: The program is terminated after display an error message
// indicating the location of the error and the error code.
//--------------------------------------------------------------------
inline void checkErr(cl_int err, string name) {
/* Check error code against OpenCL success constant */
if (err != CL_SUCCESS) {
/*
Display an error message stating the error origin and
error number.
*/
std::cerr << "ERROR: " << name << " (" << err << ")"
<< std::endl;
exit(EXIT_FAILURE); // Terminates process with status code 0
}
}
//--------------------------------------------------------------------
// initMatrix - Assigns a random float value to each indice of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been instantiated with a random
// float value.
//--------------------------------------------------------------------
void initMatrix (float* matrix, int numIndices) {
/*
Loop through the block of bytes, assigning a random float
for each index of the matrix
*/
for (int i = 0; i < numIndices; i++) {
/* Assign a random float between 0 and 1 at this byte */
matrix[i] = rand() / (float) RAND_MAX;
}
}
//--------------------------------------------------------------------
// printMatrix - Outputs a readable version of the matrix.
//
// PRE: displayName is a string; matrix is a pointer to a block of
// bytes in memory; numIndices an integer indicating the number
// of indices in the matrix being displayed (read left-to-right,
// top-to-bottom); rowSize is an integer indicating the number
// of elements in one row of the matrix.
// POST: A readable version of the matrix is displayed.
//--------------------------------------------------------------------
void printMatrix (string displayName, float* matrix, int numIndices,
int rowSize) {
/* Output display name of matrix */
cout << "\n" << displayName << ":" << endl;
/* Loop through each indice of the matrix */
for (int i = 0; i < numIndices; i++) {
cout << matrix[i]; // Display value at this indice
/* Check for next row of the matrix */
if (((i + 1) % rowSize) == 0) {
cout << endl; // Line break
} else {
cout << " | "; // Indice separator
}
}
}
Kernel:
// matrixMult1_kernels.cl
// Multiply two matrices A * B = C
// Device code.
// OpenCL Kernel
__kernel void
matrixMul(__global float* A,
__global float* B,
__global float* C,
int wA, int wB) {
// 2D Thread ID
int tx = get_local_id(0);
int ty = get_local_id(1);
// value stores the element
// that is computed by the thread
float value = 0;
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
// Write the matrix to device memory each
// thread writes one element
C[ty * wA + tx] = value;
}
Sample Output:
Matrix A:
0.398748 | 0.999793 | 0.206833
0.354238 | 0.674347 | 0.492022
0.707017 | 0.353635 | 0.430668
Matrix B:
0.91598 | 0.0260167 | 0.881732
0.810974 | 0.193091 | 0.589857
0.229151 | 0.0657822 | 0.965835
ERROR: C (-30)
I'm working with an NVIDIA GeForce 9800 GT, which only supports OpenCL 1.1. Any help here would be much appreciated.
Thanks,
Joe