0
votes

I have the following OpenCL code:

// Adds the first element of vector1 to the first element of vector2 and
// stores the sum in the first element of vector3.
// NOTE: indx is computed but never used, so every work-item reads and
// writes element 0 only.
kernel void vectorAddition(global read_only int* vector1, global read_only int* vector2, global write_only int* vector3)
{
    int indx = get_global_id(0); // global id of this work-item in dimension 0 (unused below)
    vector3[0] = vector1[0] + vector2[0];
}

and the host code:

# define __CL_ENABLE_EXCEPTIONS

#include <CL/cl.hpp>

#include <cstddef>
#include <cstdint>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>


int main()
{
    try
    {
        const long N_elements = 10;

        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);

        std::vector<cl::Device> devices;
        platforms[1].getDevices(CL_DEVICE_TYPE_ALL, &devices);

        std::ifstream helloWorldFile("VectorAddition.cl");
        std::string src(std::istreambuf_iterator<char>(helloWorldFile), (std::istreambuf_iterator<char>()));

        cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));

        cl::Context context(devices);

        cl::Program program(context, sources);

        cl_int err = program.build(devices, "-cl-std=CL1.2");

        int* vector1 = new int[N_elements];
        for (int i = 0; i < N_elements; i++) vector1[i] = 1;

        for (long i = 0; i < N_elements; i++) std::cout << vector1[i] << " ";
        std::cout << std::endl;

        cl::Buffer vec1Buff(context, CL_MEM_READ_ONLY, sizeof(int) * N_elements, vector1);

        int* vector2 = new int[N_elements];
        for (int i = 0; i < N_elements; i++) vector2[i] = 2;

        for (long i = 0; i < N_elements; i++) std::cout << vector2[i] << " ";
        std::cout << std::endl;

        cl::Buffer vec2Buff(context, CL_MEM_READ_ONLY, sizeof(int) * N_elements, vector2);

        int* vector3 = new int[N_elements];
        cl::Buffer vec3Buff(context, CL_MEM_WRITE_ONLY, sizeof(int) * N_elements, vector3);

        cl::Kernel kernel(program, "vectorAddition", &err);
        kernel.setArg(0, vec1Buff);
        kernel.setArg(1, vec2Buff);
        kernel.setArg(2, vec3Buff);

        cl::CommandQueue queue(context, devices[0]);
        queue.enqueueWriteBuffer(vec1Buff, CL_TRUE, 0, sizeof(int) * N_elements, vector1);
        queue.enqueueWriteBuffer(vec2Buff, CL_TRUE, 0, sizeof(int) * N_elements, vector2);

        queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(N_elements), cl::NDRange(10));

        queue.enqueueReadBuffer(vec3Buff, CL_TRUE, 0, sizeof(int) * N_elements, vector3);

        for (long i = 0; i < N_elements; i++) std::cout << vector3[i] << " ";

        delete[] vector1;
        delete[] vector2;
        delete[] vector3;
    }
    catch (cl::Error error)
    {
        std::cout << error.what() << "(" << error.err() << ")" << std::endl;
    }

    std::cin.get();

    return 0;
}

I get the following output:

1 1 1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2 2 2
3 0 0 0 0 0 0 0 0 0

Why is only the first item calculated?

Specifications of my GPU:

platform name: NVIDIA CUDA, platform profile: FULL_PROFILE, platform version: OpenCL 1.2 CUDA 8.0.0, platform vendor: NVIDIA Corporation, platform extensions: cl_khr_global_int32_base_atomics, cl_khr_global_int32_extended_atomics, cl_khr_local_int32_base_atomics, cl_khr_local_int32_extended_atomics, cl_khr_fp64, cl_khr_byte_addressable_store, cl_khr_icd cl_khr_gl_sharing, cl_nv_compiler_options, cl_nv_device_attribute_query, cl_nv_pragma_unroll, cl_nv_d3d9_sharing, cl_nv_d3d10_sharing, cl_khr_d3d10_sharing, cl_nv_d3d11_sharing, cl_nv_copy_opts, cl_nv_create_buffer,

device name: GeForce 820M, device vendor: NVIDIA Corporation, device type: 4, device max compute units: 2, device max work item dimensions: 3, device max work item sizes: 1024 1024 64, device max workgroup size: 1024, device max clk freq: 1250, device addr bits: 64, device max mem allocation size: 536870912, device image support: 1

1

1 Answer

4
votes
vector3[0] = vector1[0] + vector2[0]; // index 0 is hard-coded, so every work-item touches the same element

All the work-items in your kernel read from and write to the same position, 0. Use the global ID of each work-item as the index instead, so that each one works on a different element:

// Element-wise vector addition: vector3[i] = vector1[i] + vector2[i].
//
// vector1, vector2: input buffers (read only, hence `const`).
// vector3:          output buffer.
//
// The kernel takes no length argument, so the host must launch exactly one
// work-item per element in dimension 0; there is no bounds check.
//
// `read_only` / `write_only` are access qualifiers for image (and pipe)
// arguments, not for buffer pointers, so `const` is used instead to stay
// portable across OpenCL compilers.
kernel void vectorAddition(global const int* vector1, global const int* vector2, global int* vector3)
{
    // The global id of this work-item selects the element it is responsible for.
    const size_t indx = get_global_id(0);
    vector3[indx] = vector1[indx] + vector2[indx];
}