I'm trying to synchronize a host stage into my pipeline, where I basically edit some data on the host during the execution of a command buffer on the device. From reading the specification I think I'm doing the correct synchronization, execution/memory dependencies and availability/visibility operations, but it neither works on NV nor AMD hardware. Is this even possible? If so, what am I doing wrong in terms of synchronization?
In summary I'm doing the following:
- [D] A device buffer (
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) is copied to a host visible and coherent one (VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT). - [D] The first event is set.
- [D] The second event is waited for.
- [H] Meanwhile the host waits for the first event.
- [H] After it has been set, it increments the numbers in the host visible buffer.
- [H] Then it sets the second event.
- [D] The device then continues to copy the host visible buffer back to the device local buffer.
What happens?
On NV the first part works, the correct data arrives at the host side, but the altered data never arrives at the device side. On AMD not even the first part works and I already don't get the data on the host.
Command buffer recording:
// ...
VkMemoryBarrier barrier = {};
barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
barrier.srcAccessMask = ...;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
vkCmdPipelineBarrier(command_buffer, ..., VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);
copyWholeBuffer(command_buffer, host_buffer, device_buffer);
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);
vkCmdSetEvent(command_buffer, device_to_host_sync_event, VK_PIPELINE_STAGE_TRANSFER_BIT);
barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
vkCmdWaitEvents(command_buffer, 1, &host_to_device_sync_event, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 1, &barrier, 0, nullptr, 0, nullptr);
copyWholeBuffer(command_buffer, device_buffer, host_buffer);
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = ...;
vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, ..., 0, 1, &barrier, 0, nullptr, 0, nullptr);
// ...
Execution
vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE);
while(vkGetEventStatus(device, device_to_host_sync_event) != VK_EVENT_SET)
std::this_thread::sleep_for(std::chrono::microseconds(10));
void* data;
vkMapMemory(device, host_buffer, 0, BUFFER_SIZE, 0, &data);
// read and write parts of the memory
vkUnmapMemory(device, host_buffer);
vkSetEvent(device, host_to_device_sync_event);
vkDeviceWaitIdle(device);
I've uploaded a working example: https://gist.github.com/neXyon/859b2e52bac9a5a56b804d8a9d5fa4a5
The interesting bits start at line 292! Please have a look if it works for you?