I'm still new to OpenCL and have been doing some tests with the Nvidia examples. The whole program consists of 5 kernels, and these kernels execute in order (1, 2, 3, 4, 5).
The first kernel simply takes position data and velocity data, applies gravity and basic collision detection, and then adjusts that position and velocity. This kernel works perfectly, without problems.
Here is the first kernel:
/*
 * integrate: advances one particle per work-item by a single time step.
 *
 * Each work-item whose global id is below numParticles:
 *   1. loads its position and velocity, forcing pos.w = 1 and vel.w = 0;
 *   2. adds params->gravity * deltaTime to the velocity, then scales the
 *      velocity by params->globalDamping;
 *   3. moves the position by velocity * deltaTime;
 *   4. clamps the position to the cube [-1, 1]^3 shrunk by
 *      params->particleRadius on every side, multiplying the matching
 *      velocity component by params->boundaryDamping for each wall crossed;
 *   5. writes position and velocity back in place.
 *
 * d_Pos        : particle positions, read and overwritten
 * d_Vel        : particle velocities, read and overwritten
 * params       : simulation parameters (gravity, globalDamping,
 *                particleRadius and boundaryDamping are read here)
 * deltaTime    : time step length
 * numParticles : number of valid entries in d_Pos / d_Vel
 */
__kernel void integrate(
    __global float4 *d_Pos, //input/output
    __global float4 *d_Vel, //input/output
    __constant simParams_t *params,
    float deltaTime,
    uint numParticles
){
    const uint gid = get_global_id(0);

    //Surplus work-items (global size rounded up past numParticles) do nothing
    if(gid >= numParticles)
        return;

    float4 position = d_Pos[gid];
    float4 velocity = d_Vel[gid];
    position.w = 1.0f;
    velocity.w = 0.0f;

    //Apply gravity, then the global damping factor
    const float4 gravity = (float4)(params->gravity.x,
                                    params->gravity.y,
                                    params->gravity.z,
                                    0.0f);
    velocity += gravity * deltaTime;
    velocity *= params->globalDamping;

    //Move the particle
    position += velocity * deltaTime;

    //Walls of the bounding cube, pulled inwards by the particle radius
    const float lowerWall = -1.0f + params->particleRadius;
    const float upperWall =  1.0f - params->particleRadius;
    const float wallDamping = params->boundaryDamping;

    //X axis
    if(position.x < lowerWall){
        position.x = lowerWall;
        velocity.x *= wallDamping;
    }
    if(position.x > upperWall){
        position.x = upperWall;
        velocity.x *= wallDamping;
    }

    //Y axis
    if(position.y < lowerWall){
        position.y = lowerWall;
        velocity.y *= wallDamping;
    }
    if(position.y > upperWall){
        position.y = upperWall;
        velocity.y *= wallDamping;
    }

    //Z axis
    if(position.z < lowerWall){
        position.z = lowerWall;
        velocity.z *= wallDamping;
    }
    if(position.z > upperWall){
        position.z = upperWall;
        velocity.z *= wallDamping;
    }

    //Write the updated state back in place
    d_Pos[gid] = position;
    d_Vel[gid] = velocity;
}
The second kernel takes these positions as input and outputs another kind of data (some indices), but it doesn't change the position data.
The third kernel makes adjustments to the second kernel's outputs (it takes data from the second kernel, which doesn't touch the position data).
Now for the problem: the 4th kernel. It takes the position data and velocity data (from the first kernel) and the adjusted data from the third kernel, and outputs another set of position and velocity data (through totally different pointers from the input position and velocity).
Here is the fourth kernel:
/*
 * findCellBoundsAndReorder: one work-item per entry of the sorted hash list.
 *
 * For every index below numParticles the kernel
 *   - compares its hash with the hash of the previous entry; where they
 *     differ, the index is recorded as the start of the current hash's cell
 *     and as the end of the previous hash's cell;
 *   - handles the two ends of the list (index 0 starts a cell at 0, index
 *     numParticles - 1 ends its cell at numParticles);
 *   - copies position and velocity from slot d_Index[index] of the input
 *     arrays to slot index of the reordered output arrays.
 *
 * The previous entry's hash is obtained through local memory: each work-item
 * stores its own hash one slot to the right, and the first work-item of the
 * group additionally fetches the hash just before the group.
 *
 * d_Pos and d_Vel are only read; all writes go to the four output buffers.
 */
__kernel void findCellBoundsAndReorder(
    __global uint *d_CellStart,       //output: first index of each cell
    __global uint *d_CellEnd,         //output: one-past-last index of each cell
    __global float4 *d_ReorderedPos,  //output: positions gathered through d_Index
    __global float4 *d_ReorderedVel,  //output: velocities gathered through d_Index
    __global const uint *d_Hash,      //input: sorted grid hashes
    __global const uint *d_Index,     //input: particle indices sorted by hash
    __global const float4 *d_Pos,     //input: positions, read at d_Index[index]
    __global const float4 *d_Vel,     //input: velocities, read at d_Index[index]
    __local uint *localHash,          //scratch: local work size + 1 elements
    uint numParticles
){
    const uint gid = get_global_id(0);
    const uint lid = get_local_id(0);

    //Only assigned (and only read) by work-items with gid < numParticles
    uint hash;

    //Stage hashes in local memory, shifted by one slot, so that after the
    //barrier localHash[lid] holds the hash of the preceding entry
    if(gid < numParticles){
        hash = d_Hash[gid];
        localHash[lid + 1] = hash;

        //The group's first work-item supplies the hash preceding the group
        if(lid == 0 && gid > 0)
            localHash[0] = d_Hash[gid - 1];
    }

    //Executed by every work-item of the group, including out-of-range ones
    barrier(CLK_LOCAL_MEM_FENCE);

    //No further barriers below, so surplus work-items can leave here
    if(gid >= numParticles)
        return;

    if(gid == 0){
        //Very first entry: it has no predecessor and opens its cell at 0
        d_CellStart[hash] = 0;
    }else{
        const uint previousHash = localHash[lid];

        //A change of hash marks the boundary between two cells
        if(previousHash != hash){
            d_CellStart[hash] = gid;
            d_CellEnd[previousHash] = gid;
        }
    }

    //Very last entry closes its cell
    if(gid == numParticles - 1)
        d_CellEnd[hash] = numParticles;

    //Gather position and velocity through the sorted index
    const uint sourceSlot = d_Index[gid];
    const float4 gatheredPos = d_Pos[sourceSlot];
    const float4 gatheredVel = d_Vel[sourceSlot];
    d_ReorderedPos[gid] = gatheredPos;
    d_ReorderedVel[gid] = gatheredVel;
}
The problem is that if I execute kernel 1 alone (or 1+2, or 1+2+3), the positions and velocities are adjusted correctly by the first kernel.
But if I execute kernels 1+2+3+4 (even though kernel 4 doesn't change its input data), the data remains the same, as if I hadn't executed anything: the positions are not adjusted.