7
votes

I'm really new to DirectCompute technologies, and have been attempting to learn from the documentation on the MSDN website, which is... dense, to say the least.

I'd like to make a basic hlsl file that takes in a 4x4 matrix and a 4xN matrix and returns the multiplied result. But after spending some time playing with the code, I've found some weird stuff I don't understand - mainly with how the threads I pass in process the buffers and output data.

With all of these examples, I pass in two 16-float buffers, get back a 16-float buffer, and then Dispatch with a 4x1x1 grouping. I can show you code, but I honestly don't yet know what would help you help me. Let me know if there's a section of my C++ code you want to see.

with the following code:

StructuredBuffer<float4x4> base_matrix     : register(t0); // byteWidth = 64
StructuredBuffer<float4>   extended_matrix : register(t1); // byteWidth = 64
RWStructuredBuffer<float4> BufferOut       : register(u0); // byteWidth = 64, zeroed out before reading from the GPU

// Experiment 1: each thread writes only the x component of the output element
// selected by its dispatch thread id. base_matrix and extended_matrix are declared
// but not read by this version of the shader.
[numthreads(1, 1, 1)] // one thread per thread group
void CSMain( uint3 DTid : SV_DispatchThreadID )
{
    BufferOut[DTid.x].x = 1; // y, z and w are left untouched
}

I get the following values out:

1.000 0.000 0.000 0.000
1.000 0.000 0.000 0.000
1.000 0.000 0.000 0.000
1.000 0.000 0.000 0.000

This makes sense to me - the buffer is parsed as 4 threads, each executing 1 float4 grouping.

with the following code:

StructuredBuffer<float4x4> base_matrix     : register(t0); // byteWidth = 64
StructuredBuffer<float4>   extended_matrix : register(t1); // byteWidth = 64
RWStructuredBuffer<float4> BufferOut       : register(u0); // byteWidth = 64, zeroed out before reading from the GPU

// Experiment 2: each thread writes all four components of its output element,
// one component per statement. base_matrix and extended_matrix are declared
// but not read by this version of the shader.
[numthreads(1, 1, 1)] // one thread per thread group
void CSMain( uint3 DTid : SV_DispatchThreadID )
{
    // A distinct value per component makes it visible which write landed where.
    BufferOut[DTid.x].x = 1;
    BufferOut[DTid.x].y = 2;
    BufferOut[DTid.x].z = 3;
    BufferOut[DTid.x].w = 4;
}

I get the following values out:

1.000 1.000 1.000 1.000
1.000 1.000 1.000 1.000
1.000 1.000 1.000 1.000
1.000 1.000 1.000 1.000

and with the actual code I want to run:

StructuredBuffer<float4x4> base_matrix     : register(t0);
StructuredBuffer<float4>   extended_matrix : register(t1);
RWStructuredBuffer<float4> BufferOut       : register(u0);

// Multiplies the 4x4 matrix stored in base_matrix[0] by one float4 of
// extended_matrix per thread and stores the product in the BufferOut element
// with the same index.
// With the matrix as the first argument, mul() treats the float4 as a column vector.
[numthreads(1, 1, 1)] // one thread per thread group
void CSMain( uint3 DTid : SV_DispatchThreadID )
{
    BufferOut[DTid.x] = mul(base_matrix[0], extended_matrix[DTid.x]);
}

I get the following values out:

0.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000

I can tell I'm missing a critical thing here, but for the life of me I can't find the appropriate documentation telling me how these work. Could someone help me understand what's going on in this code?

Thanks for your time,

Zach

As another note, this code was cribbed together using the Microsoft DirectX SDK (June 2010)\Samples\C++\Direct3D11\BasicCompute11 Sample available. If I'm doing something terribly wrong, feel free to let me know. I'm REALLY new at HLSL.

Edit: My buffer creation code.

// Input 0: a single element of sizeof(float)*16 bytes, initialised from g_matrix.
// NOTE(review): the HRESULT returned by each of these calls is not checked here.
CreateStructuredBuffer( g_pDevice, sizeof(float)*16,     1,            g_matrix,          &g_pBuf0 );
// Input 1: NUM_ELEMENTS elements of sizeof(float)*4 bytes each, initialised from g_extended_matrix.
CreateStructuredBuffer( g_pDevice, sizeof(float)*4,      NUM_ELEMENTS, g_extended_matrix, &g_pBuf1 );
// Output: same element size and count as input 1; no initial data is supplied (NULL).
CreateStructuredBuffer( g_pDevice, sizeof(float)*4,      NUM_ELEMENTS, NULL,              &g_pBufResult );

//--------------------------------------------------------------------------------------
// Create Structured Buffer
//
// Creates a structured buffer holding uCount elements of uElementSize bytes each,
// with both the unordered-access and shader-resource bind flags set.
//
//   pDevice      - device the buffer is created on
//   uElementSize - size of one element in bytes; also used as StructureByteStride
//   uCount       - number of elements
//   pInitData    - optional pointer to the initial contents, or NULL for none
//   ppBufOut     - receives the created buffer; reset to NULL before creation is attempted
//
// Returns the HRESULT produced by ID3D11Device::CreateBuffer.
//--------------------------------------------------------------------------------------
HRESULT CreateStructuredBuffer( ID3D11Device* pDevice, UINT uElementSize, UINT uCount, VOID* pInitData, ID3D11Buffer** ppBufOut )
{
    *ppBufOut = NULL;

    D3D11_BUFFER_DESC desc;
    ZeroMemory( &desc, sizeof(desc) );
    desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
    desc.ByteWidth = uElementSize * uCount;
    desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
    desc.StructureByteStride = uElementSize;

    // Zero the whole descriptor so the pitch members are never passed to the
    // runtime with indeterminate values; only pSysMem is meaningful here.
    D3D11_SUBRESOURCE_DATA InitData;
    ZeroMemory( &InitData, sizeof(InitData) );
    InitData.pSysMem = pInitData;

    // Hand the descriptor over only when the caller actually supplied initial data.
    return pDevice->CreateBuffer( &desc, pInitData ? &InitData : NULL, ppBufOut );
}

Trying .1,.2,.3,.4 ...

StructuredBuffer<float4x4> base_matrix     : register(t0);
StructuredBuffer<float4>   extended_matrix : register(t1);
StructuredBuffer<uint>     loop_multiplier : register(t2);
RWStructuredBuffer<float4> BufferOut       : register(u0);

// Follow-up experiment suggested in the comments: same per-component writes as
// before, but with fractional values 0.1 .. 0.4 instead of 1 .. 4.
// None of the three input buffers is read by this version of the shader.
[numthreads(1, 1, 1)] // one thread per thread group
void CSMain( uint3 DTid : SV_DispatchThreadID )
{
    BufferOut[DTid.x].x = .1;
    BufferOut[DTid.x].y = .2;
BufferOut[DTid.x].z = .3;
BufferOut[DTid.x].w = .4;
}

got this out:

0.100 0.100 0.100 0.100
0.100 0.100 0.100 0.100
0.100 0.100 0.100 0.100
0.100 0.100 0.100 0.100
1
Could you post the code where you create your ID3D11Buffers and the corresponding ID3D11ShaderResourceView and ID3D11UnorderedAccessView objects, including the contents of the various *_DESC objects used to create them? Also, random hunch: can you try writing out 0.1, 0.2, 0.3, 0.4 instead of 1,2,3,4 in the second example? — postgoodism
I'll post this in the morning (ETA 8 hours). Didn't catch this until later. But thanks in advance! — Zach H
@postgoodism : Added my buffer creation code. Let me know if you need anything else. — Zach H
The code to create your unordered access view is still missing; that's the last piece of the puzzle. — postgoodism
Never mind, I assume it's unmodified from BasicCompute11. I was able to get the expected/correct results when I made the same modifications to the sample (modified the call to CreateStructuredBuffer to use uElementSize=16; changed the element type of BufferOut in the shader to float4 instead of BufType; changed the body of CSMain() to write 1,2,3,4 to BufferOut[DTid.x].xyzw). If you go back to a fresh copy of the sample and apply just those three changes, do you get the correct results? — postgoodism

1 Answer

0
votes

I tried it your way, but I got the correct result. I can't post this as a comment because I don't have enough reputation, so here is my code.

HLSL:

RWStructuredBuffer<float4> Output : register(u0);

// Writes the constant vector (1, 2, 3, 4) into the output element selected by the
// thread's dispatch id, one component per statement.
[numthreads(1, 1, 1)] // one thread per thread group
void main( uint3 DTid : SV_DispatchThreadID )
{
    // The host code shown with this shader allocates 4 elements (indices 0..3),
    // so any thread with an id of 4 or above has nothing to write.
    if (DTid.x >= 4)
        return;

    Output[DTid.x].x = 1.f;
    Output[DTid.x].y = 2.f;
    Output[DTid.x].z = 3.f;
    Output[DTid.x].w = 4.f;
}

C++:

// Path of the compiled compute shader binary (.cso) that is handed to the shader loader.
// Backslashes are doubled so that they are path separators rather than escape sequences.
#define PathName L"C:\\Users\\e\\Desktop\\D3D_Reseach\\RenderPro\\x64\\Debug\\ComputeShader.cso"

// One element of the structured buffer as seen from the CPU: a single four-float vector.
// Its size is used below for both the buffer's ByteWidth and StructureByteStride.
struct Buffer

{

XMFLOAT4 Test;

};

// Test harness: runs the compute shader over a 4-element structured buffer and reads
// the result back on the CPU.
// Hardware, WinSystem, ShaderResourceView, CompilerShader, BufferSystem and SharedComPtr
// are project helpers whose definitions are not shown here; comments about them describe
// the apparent intent only and should be confirmed against their implementations.
// Always returns 0.
int APIENTRY wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPTSTR lpCmdLine, int nCmdShow)
{
    Hardware HardWare;
    WinSystem Win;
    Win.CreateWindows(HardWare, 400, 300);
    ShowWindow(Win.hwnd, SW_HIDE);   // no rendering happens, so keep the window hidden

    //UAV
    SharedComPtr<ID3D11UnorderedAccessView> Resource;
    SharedComPtr<ID3D11Buffer>              _Buffer;

    ShaderResourceView::STRUCT_BUUFER_DESC Desc;
    Desc.ACCESS = 0;
    Desc.BIND = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
    Desc.FORMAT = DXGI_FORMAT_UNKNOWN;
    Desc.HasScr = false;
    Desc.MISC = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
    Desc.USAGE = D3D11_USAGE_DEFAULT;
    Desc.ByteWidth = 4 * sizeof(Buffer);          // four elements
    Desc.StructureByteStride = sizeof(Buffer);    // one XMFLOAT4 per element
    Desc.UAV_OR_SRV = ShaderResourceView::UAV;

    ShaderResourceView::CreateStructBuffer(HardWare.GetD3DDevice(), Desc, nullptr,
                                           Resource.GetTwoLevel(), _Buffer.GetTwoLevel(), true);

    //CompilerShader
    SharedComPtr<ID3D11ComputeShader> ComputerSahder;
    SharedComPtr<ID3DBlob>            Blob;

    // A writable array instead of a pointer to the literal: binding a wide string
    // literal to a non-const WCHAR* is ill-formed in standard C++.
    WCHAR Name[] = PathName;

    CompilerShader::CompileShaderFromBinary(ComputerSahder.GetTwoLevel(), Name, HardWare.GetD3DDevice(),
                                            Blob.GetTwoLevel(), CompilerShader::ShaderFlag::ComputeShader);

    //Set ComputerHlsl
    HardWare.GetDeviceContext()->CSSetUnorderedAccessViews(0, 1, Resource.GetTwoLevel(), 0);
    HardWare.GetDeviceContext()->CSSetShader(ComputerSahder.Get(), 0, 0);
    HardWare.GetDeviceContext()->Dispatch(4, 1, 1);   // four groups, one per buffer element

    //SRV
    // Staging copy used to read the GPU result back on the CPU.
    Buffer Hy[4] = {};   // zeroed so the helper is never handed indeterminate initial data
    VOID *P = Hy;
    ID3D11Buffer* pBuffer = nullptr;

    BufferSystem::CreateConstanceBuffer(HardWare.GetD3DDevice(), P, pBuffer,
                                        Desc.ByteWidth, D3D11_USAGE_STAGING);

    if (pBuffer != nullptr) {
        HardWare.GetDeviceContext()->CopyResource(pBuffer, _Buffer.Get());

        D3D11_MAPPED_SUBRESOURCE Data = {};
        HardWare.GetDeviceContext()->Map(pBuffer, 0, D3D11_MAP_READ, 0, &Data);

        // Data was zero-initialised above, so a null pData means the map did not succeed.
        if (Data.pData != nullptr) {
            const Buffer *PP = static_cast<const Buffer*>(Data.pData);

            for (UINT i = 0; i < 4; ++i) {
                // Each component is loaded in turn so it can be inspected in a debugger.
                float a = PP[i].Test.x;
                a = PP[i].Test.y;
                a = PP[i].Test.z;
                a = PP[i].Test.w;
                (void)a;
            }

            // Every successful Map must be paired with an Unmap.
            HardWare.GetDeviceContext()->Unmap(pBuffer, 0);
        }

        // pBuffer is a raw COM pointer owned by this function; drop the reference.
        pBuffer->Release();
    }

    return 0;
}