I am able to dump the contents of an R32G32B32A32 image for a screenshot. I would like to read out a pixel from an R32G32_SFLOAT image as well, but the result looks weird.
Below is my working image dump code (no validation errors):
// Reads back `image` and writes it to `filename` through lodepng::encode.
//
// The readback goes through two staging resources:
//   1. `image` is blitted into a temporary R8G8B8A8_UNORM image, so the blit
//      performs the format conversion;
//   2. that temporary image is copied into a host-visible, host-coherent
//      buffer of width * height * 4 bytes, which is then mapped and encoded.
//
// Parameters:
//   device          device wrapper used to issue the Vulkan commands.
//   graphics_queue  queue the recorded commands are submitted to.
//   command_buffer  command buffer wrapper that is recorded and submitted here.
//   image           source image; it must currently be in
//                   VK_IMAGE_LAYOUT_PRESENT_SRC_KHR and is returned to that
//                   layout before the function exits.
//   width, height   extent of `image` in pixels.
//   filename        path handed to lodepng::encode.
//
// The function blocks on a fence until the GPU work has finished.
// NOTE(review): the value returned by lodepng::encode is not checked.
void DumpImageToFile(VkTool::VulkanDevice &device, VkQueue graphics_queue, VkTool::Wrapper::CommandBuffers &command_buffer, VkImage image, uint32_t width, uint32_t height, const char *filename)
{
    // Layout the source image is in on entry and is restored to on exit.
    // The old layout of the first barrier must NOT be VK_IMAGE_LAYOUT_UNDEFINED:
    // that tells the driver it may discard the very contents we want to read.
    // NOTE(review): assumes callers pass a presentable image; an image living in
    // another layout needs that layout here instead - confirm against callers.
    const VkImageLayout source_layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR;

    auto image_create_info = VkTool::Initializer::GenerateImageCreateInfo(VK_IMAGE_TYPE_2D, VK_FORMAT_R8G8B8A8_UNORM, { width, height, 1 },
        VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT, VK_SAMPLE_COUNT_1_BIT);
    // Memory *property* flag (not the heap flag of the same numeric value).
    VkTool::Wrapper::Image staging_image(device, image_create_info, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

    // 4 bytes per pixel, matching the R8G8B8A8_UNORM staging image.
    auto buffer_create_info = VkTool::Initializer::GenerateBufferCreateInfo(width * height * 4, VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    VkTool::Wrapper::Buffer staging_buffer(device, buffer_create_info, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);

    command_buffer.Begin();

    // Staging image: freshly created, so its previous contents are irrelevant
    // and UNDEFINED is the right old layout.
    auto image_memory_barrier = VkTool::Initializer::GenerateImageMemoryBarrier(VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }, staging_image.Get());
    device.vkCmdPipelineBarrier(command_buffer.Get(), VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
        0, nullptr, 0, nullptr, 1, &image_memory_barrier);

    // Source image: keep its contents while making it readable by the blit.
    image_memory_barrier = VkTool::Initializer::GenerateImageMemoryBarrier(source_layout, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
        { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }, image);
    device.vkCmdPipelineBarrier(command_buffer.Get(), VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
        0, nullptr, 0, nullptr, 1, &image_memory_barrier);

    // Blit the whole source image onto the whole staging image (same extent).
    VkImageBlit region = {};
    region.srcSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 };
    region.srcOffsets[0] = { 0, 0, 0 };
    region.srcOffsets[1] = { static_cast<int32_t>(width), static_cast<int32_t>(height), 1 };
    region.dstSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 };
    region.dstOffsets[0] = { 0, 0, 0 };
    region.dstOffsets[1] = { static_cast<int32_t>(width), static_cast<int32_t>(height), 1 };
    // Source and destination regions have identical extents, so no filtering is
    // needed; NEAREST avoids requiring linear-filter support on the source
    // format (optional for e.g. 32-bit float formats).
    device.vkCmdBlitImage(command_buffer.Get(), image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, staging_image.Get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region, VK_FILTER_NEAREST);

    // Hand the source image back in the layout it arrived in.
    image_memory_barrier = VkTool::Initializer::GenerateImageMemoryBarrier(VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, source_layout,
        { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }, image);
    device.vkCmdPipelineBarrier(command_buffer.Get(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, 0,
        0, nullptr, 0, nullptr, 1, &image_memory_barrier);

    // The staging image was the blit destination; it is now the copy source.
    image_memory_barrier = VkTool::Initializer::GenerateImageMemoryBarrier(VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
        { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }, staging_image.Get());
    device.vkCmdPipelineBarrier(command_buffer.Get(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
        0, nullptr, 0, nullptr, 1, &image_memory_barrier);

    auto buffer_image_copy = VkTool::Initializer::GenerateBufferImageCopy({ VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 }, { width, height, 1 });
    device.vkCmdCopyImageToBuffer(command_buffer.Get(), staging_image.Get(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, staging_buffer.Get(), 1, &buffer_image_copy);

    command_buffer.End();

    // Submit and block until the GPU has finished writing the staging buffer.
    std::vector<VkCommandBuffer> raw_command_buffers = command_buffer.GetAll();
    auto submit_info = VkTool::Initializer::GenerateSubmitInfo(raw_command_buffers);
    VkTool::Wrapper::Fence fence(device);
    device.vkQueueSubmit(graphics_queue, 1, &submit_info, fence.Get());
    fence.Wait();
    fence.Destroy();

    // Encode straight from the mapped staging buffer.
    const uint8_t *mapped_address = reinterpret_cast<const uint8_t *>(staging_buffer.MapMemory());
    lodepng::encode(filename, mapped_address, width, height);
    staging_buffer.UnmapMemory();

    staging_image.Destroy();
    staging_buffer.Destroy();
}
Sorry for the ugly self-made wrapper; there was no official wrapper. Basically, it creates a staging image and a staging buffer. It first copies from the source image to the staging image with vkCmdBlitImage, then uses vkCmdCopyImageToBuffer and maps the buffer into host memory. This method works on multiple GPUs and does not need to worry about padding (I guess; correct me if I am wrong).
However, I have had no luck using this method to read R32G32_SFLOAT. At first I thought it was because of endianness, until I dumped the whole image out.
The image above shows what I get when I directly convert R32G32_SFLOAT to R8G8B8A8_UNORM; I know that does not make sense. But even without changing the format, there are still a lot of "holes" in the image and the values are badly wrong.

